src/std/utf.d

181254a7Smrg// Written in the D programming language.
181254a7Smrg
181254a7Smrg/++
181254a7Smrg    Encode and decode UTF-8, UTF-16 and UTF-32 strings.
181254a7Smrg
181254a7Smrg    UTF character support is restricted to
181254a7Smrg    $(D '\u0000' &lt;= character &lt;= '\U0010FFFF').
181254a7Smrg
181254a7Smrg$(SCRIPT inhibitQuickIndex = 1;)
*b1e83836Smrg$(DIVC quickindex,
181254a7Smrg$(BOOKTABLE,
181254a7Smrg$(TR $(TH Category) $(TH Functions))
181254a7Smrg$(TR $(TD Decode) $(TD
181254a7Smrg    $(LREF decode)
181254a7Smrg    $(LREF decodeFront)
181254a7Smrg))
181254a7Smrg$(TR $(TD Lazy decode) $(TD
181254a7Smrg    $(LREF byCodeUnit)
181254a7Smrg    $(LREF byChar)
181254a7Smrg    $(LREF byWchar)
181254a7Smrg    $(LREF byDchar)
181254a7Smrg    $(LREF byUTF)
181254a7Smrg))
181254a7Smrg$(TR $(TD Encode) $(TD
181254a7Smrg    $(LREF encode)
181254a7Smrg    $(LREF toUTF8)
181254a7Smrg    $(LREF toUTF16)
181254a7Smrg    $(LREF toUTF32)
181254a7Smrg    $(LREF toUTFz)
181254a7Smrg    $(LREF toUTF16z)
181254a7Smrg))
181254a7Smrg$(TR $(TD Length) $(TD
181254a7Smrg    $(LREF codeLength)
181254a7Smrg    $(LREF count)
181254a7Smrg    $(LREF stride)
181254a7Smrg    $(LREF strideBack)
181254a7Smrg))
181254a7Smrg$(TR $(TD Index) $(TD
181254a7Smrg    $(LREF toUCSindex)
181254a7Smrg    $(LREF toUTFindex)
181254a7Smrg))
181254a7Smrg$(TR $(TD Validation) $(TD
181254a7Smrg    $(LREF isValidDchar)
*b1e83836Smrg    $(LREF isValidCodepoint)
181254a7Smrg    $(LREF validate)
181254a7Smrg))
181254a7Smrg$(TR $(TD Miscellaneous) $(TD
181254a7Smrg    $(LREF replacementDchar)
181254a7Smrg    $(LREF UseReplacementDchar)
181254a7Smrg    $(LREF UTFException)
181254a7Smrg))
*b1e83836Smrg))
181254a7Smrg    See_Also:
181254a7Smrg        $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
181254a7Smrg        $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
181254a7Smrg        $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
*b1e83836Smrg    Copyright: Copyright The D Language Foundation 2000 - 2012.
181254a7Smrg    License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
*b1e83836Smrg    Authors:   $(HTTP digitalmars.com, Walter Bright) and
*b1e83836Smrg               $(HTTP jmdavisprog.com, Jonathan M Davis)
*b1e83836Smrg    Source:    $(PHOBOSSRC std/utf.d)
181254a7Smrg   +/
181254a7Smrgmodule std.utf;
181254a7Smrg
*b1e83836Smrgimport std.exception : basicExceptionCtors;
*b1e83836Smrgimport core.exception : UnicodeException;
*b1e83836Smrgimport std.meta : AliasSeq;
*b1e83836Smrgimport std.range;
*b1e83836Smrgimport std.traits : isAutodecodableString, isConvertibleToString, isPointer,
*b1e83836Smrg    isSomeChar, isSomeString, isStaticArray, Unqual;
*b1e83836Smrgimport std.typecons : Flag, Yes, No;
181254a7Smrg
181254a7Smrg
/++
    Exception type raised by the functions in std.utf when they are handed
    malformed UTF data or a value that cannot be encoded.
  +/
class UTFException : UnicodeException
{
    import core.internal.string : unsignedToTempString, UnsignedStringBuf;

    // Offending code units; only the first `len` entries are meaningful.
    uint[4] sequence;
    // Number of entries of `sequence` in use (0 when nothing was recorded).
    size_t  len;

    // Stores up to four code units of the offending sequence and returns
    // this exception, so the call can be chained directly onto `new`.
    @safe pure nothrow @nogc
    UTFException setSequence(scope uint[] data...) return
    {
        assert(data.length <= 4);

        // The clamp keeps the copy in bounds even when the assert above
        // is compiled out.
        immutable used = data.length < 4 ? data.length : 4;
        sequence[0 .. used] = data[0 .. used];
        len = used;

        return this;
    }

    // FIXME: Use std.exception.basicExceptionCtors here once
    // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed

    /**
    Standard exception constructors.

    The overload taking `index` appends `" (at index N)"` to `msg` and
    forwards `index` to the base class; the other overload forwards index `0`.
     */
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, 0, file, line, next);
    }
    /// ditto
    this(string msg, size_t index, string file = __FILE__,
         size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        // Scratch space for the decimal rendering of `index`; it is filled
        // by unsignedToTempString before being read.
        UnsignedStringBuf buf = void;
        msg ~= " (at index " ~ unsignedToTempString(index, buf) ~ ")";
        super(msg, index, file, line, next);
    }

    /**
    Returns:
        A `string` detailing the invalid UTF sequence. When no sequence was
        recorded, the result of the inherited `toString` is returned instead.
     */
    override string toString() const
    {
        if (len == 0)
        {
            /* Exception.toString() is not marked as const, although
             * it is const-compatible, hence the cast.
             */
            auto base = () @trusted { return cast(Exception) super; } ();
            return base.toString();
        }

        string text = "Invalid UTF sequence:";

        foreach (unit; sequence[0 .. len])
        {
            UnsignedStringBuf scratch = void;
            const hex = unsignedToTempString!16(unit, scratch);

            // Separator, plus a leading zero so every unit shows at least
            // two hex digits.
            text ~= (hex.length == 1) ? " 0" : " ";
            text ~= hex;
            text ~= 'x';
        }

        if (super.msg.length > 0)
        {
            text ~= " - ";
            text ~= super.msg;
        }

        return text;
    }
}
181254a7Smrg
///
@safe unittest
{
    import std.exception : assertThrown;

    char[4] buf;

    // Both ends of the high and low surrogate ranges, followed by the first
    // value past the last Unicode code point: none of them can be encoded.
    foreach (uint raw; [0xD800, 0xDBFF, 0xDC00, 0xDFFF, 0x110000])
        assertThrown!UTFException(encode(buf, cast(dchar) raw));
}
*b1e83836Smrg
/*
   Supplies a fixed set of invalidly encoded UTF strings for the requested
   code unit type. Intended for use in tests.

   Params:
        Char = char, wchar, or dchar

   Returns:
        an array of invalidly encoded UTF strings
 */

package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow
if (isSomeChar!Char)
{
    static if (is(Char == char))
    {
        enum surrogate = 0xDC00;      // invalid surrogate value
        enum beyondMax = 0x110000;    // out of range

        static immutable string[8] result =
        [
            "\x80",             // not a start byte
            "\xC0",             // truncated
            "\xC0\xC0",         // invalid continuation
            "\xF0\x82\x82\xAC", // overlong
            [
              // three byte encoding of the surrogate value
              0xE0 | (surrogate >> 12),
              0x80 | ((surrogate >> 6) & 0x3F),
              0x80 | (surrogate & 0x3F)
            ],
            [
              // four byte encoding of the out-of-range value
              cast(char)(0xF0 | (beyondMax >> 18)),
              cast(char)(0x80 | ((beyondMax >> 12) & 0x3F)),
              cast(char)(0x80 | ((beyondMax >> 6) & 0x3F)),
              cast(char)(0x80 | (beyondMax & 0x3F))
            ],
            [
              cast(char)(0xF8 | 3),     // 5 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
            [
              cast(char)(0xFC | 3),     // 6 byte encoding
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
              cast(char)(0x80 | 3),
            ],
        ];

        return result[];
    }
    else static if (is(Char == wchar))
    {
        static immutable wstring[5] result =
        [
            [ cast(wchar) 0xDC00 ],                     // lone surrogate from the low range
            [ cast(wchar) 0xDFFF ],                     // lone surrogate from the low range
            [ cast(wchar) 0xDBFF, cast(wchar) 0xDBFF ], // high surrogate followed by another high surrogate
            [ cast(wchar) 0xDBFF, cast(wchar) 0xE000 ], // high surrogate followed by a non-surrogate
            [ cast(wchar) 0xD800 ],                     // lone surrogate from the high range
        ];

        return result[];
    }
    else static if (is(Char == dchar))
    {
        static immutable dstring[3] result =
        [
            [ cast(dchar) 0x110000 ],   // beyond 0x10FFFF
            [ cast(dchar) 0x00D800 ],   // first surrogate value
            [ cast(dchar) 0x00DFFF ],   // last surrogate value
        ];

        // Unlike the branches above, this one returns the static array itself.
        return result;
    }
    else
    {
        static assert(0);
    }
}
181254a7Smrg
/++
    Tests whether a value is a valid Unicode code point.

    Params:
        c = code point to check

    Returns:
        `true` if and only if `c` is a valid Unicode code point, that is,
        it is at most `0x10FFFF` and does not lie in the surrogate range
        `0xD800 .. 0xDFFF`.

    Note:
    `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`,
    as they are permitted for internal use by an application, but they are
    not allowed for interchange by the Unicode standard.
  +/
bool isValidDchar(dchar c) pure nothrow @safe @nogc
{
    // Surrogate values are reserved for UTF-16 and are never code points.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    // Everything else is valid up to the end of the Unicode range.
    return c <= 0x10FFFF;
}
181254a7Smrg
///
@safe @nogc pure nothrow unittest
{
    // ordinary values, including NUL, are valid
    assert( isValidDchar('A'));
    assert( isValidDchar('\0'));

    // a surrogate and a value past 0x10FFFF are not
    assert(!isValidDchar(cast(dchar) 0xD800));
    assert(!isValidDchar(cast(dchar) 0x11FFFF));
}
*b1e83836Smrg
pure nothrow @safe @nogc unittest
{
    import std.exception;

    // Runs the same checks at compile time and at run time.
    assertCTFEable!(
    {
        assert( isValidDchar(cast(dchar) 'a'));
        assert(!isValidDchar(cast(dchar) 0x1FFFFF));

        // the surrogate range is excluded at both of its ends
        assert(!isValidDchar(cast(dchar) 0x00D800));
        assert(!isValidDchar(cast(dchar) 0x00DBFF));
        assert(!isValidDchar(cast(dchar) 0x00DC00));
        assert(!isValidDchar(cast(dchar) 0x00DFFF));

        // noncharacters are still accepted
        assert( isValidDchar(cast(dchar) 0x00FFFE));
        assert( isValidDchar(cast(dchar) 0x00FFFF));
        assert( isValidDchar(cast(dchar) 0x01FFFF));

        // upper boundary of the Unicode range
        assert( isValidDchar(cast(dchar) 0x10FFFF));
        assert(!isValidDchar(cast(dchar) 0x110000));
    });
}
181254a7Smrg
/**
Checks whether a single character, taken on its own, forms a valid code point.

Some characters are not valid code points when they stand alone. For
example the `wchar` `0xD800` is a so called high surrogate, which can
only be interpreted together with a low surrogate following it. As a
standalone character it is considered invalid.

See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/,
Unicode Standard, D90, D91 and D92) for more details.

Params:
    c = character to test
    Char = character type of `c`

Returns:
    `true`, if `c` forms a valid code point.
 */
bool isValidCodepoint(Char)(Char c)
if (isSomeChar!Char)
{
    // Qualifiers do not matter for the classification below.
    alias Bare = Unqual!Char;

    static if (is(Bare == char))
    {
        // Only the values up to 0x7F are accepted for a lone `char`.
        return c <= 0x7F;
    }
    else static if (is(Bare == wchar))
    {
        // Everything except the surrogate range 0xD800 .. 0xDFFF.
        return !(c >= 0xD800 && c <= 0xDFFF);
    }
    else static if (is(Bare == dchar))
    {
        return isValidDchar(c);
    }
    else
    {
        static assert(false, "unknown character type: `" ~ Char.stringof ~ "`");
    }
}
*b1e83836Smrg
///
@safe pure nothrow unittest
{
    // char: a value below 0x80 is accepted, 0x80 is not
    assert( isValidCodepoint(cast(char) 0x40));
    assert(!isValidCodepoint(cast(char) 0x80));

    // wchar: surrogates are rejected
    assert( isValidCodepoint(cast(wchar) 0x1234));
    assert(!isValidCodepoint(cast(wchar) 0xD800));

    // dchar: same rule as isValidDchar
    assert( isValidCodepoint(cast(dchar) 0x0010FFFF));
    assert(!isValidCodepoint(cast(dchar) 0x12345678));
}
*b1e83836Smrg
/++
    Determines how many code units make up the UTF sequence that begins at
    `index` in `str`.

    Params:
        str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        of UTF code units. Must be random access if `index` is passed
        index = position of the first code unit of the sequence (default: `0`)

    Returns:
        The number of code units in the UTF sequence. For UTF-8, this is a
        value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.

    Throws:
        May throw a `UTFException` if `str[index]` is not the start of a
        valid UTF sequence.

    Note:
        `stride` inspects only the single code unit `str[index]`. It neither
        validates the remainder of the sequence nor checks that the remainder
        is present, so $(D index + stride(str, index) <= str.length) is not
        guaranteed.
  +/
uint stride(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    // The bounds check is only possible when the range exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-8 sequence");

    immutable lead = str[index];

    // ASCII is a sequence of its own; every other lead byte is handed to
    // strideImpl.
    return lead < 0x80 ? 1 : strideImpl(lead, index);
}
181254a7Smrg
/// Ditto
uint stride(S)(auto ref S str)
if (is(S : const char[]) ||
    (isInputRange!S && is(immutable ElementType!S == immutable char)))
{
    // Arrays are indexed directly; any other range is read through `front`.
    static if (is(S : const char[]))
        immutable lead = str[0];
    else
        immutable lead = str.front;

    // ASCII is a sequence of its own; every other lead byte is handed to
    // strideImpl, with 0 as the reported index.
    return lead < 0x80 ? 1 : strideImpl(lead, 0);
}
181254a7Smrg
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Verifies that stride reports codeLength!char(c) code units for the
    // sequence starting at code unit `i` of `s`, for the string itself and
    // for the range wrappers used by this module's tests. Failures are
    // attributed to the caller's line.
    static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!char(c);

        // Throws an AssertError built from `fmt` when `ok` is false.
        void check(bool ok, string fmt)
        {
            if (!ok)
                throw new AssertError(format(fmt, s), __FILE__, line);
        }

        check(stride(s, i) == expected, "Unit test failure string: %s");
        check(stride(RandomCU!char(s), i) == expected, "Unit test failure range: %s");

        // A reference-type range must not be consumed by stride.
        auto refRandom = new RefRandomCU!char(s);
        immutable randLen = refRandom.length;
        check(stride(refRandom, i) == expected, "Unit test failure rand ref range: %s");
        check(refRandom.length == randLen, "Unit test failure rand ref range length: %s");

        // The overloads without an index only apply to the first sequence.
        if (i != 0)
            return;

        check(stride(s) == expected, "Unit test failure string 0: %s");
        check(stride(InputCU!char(s)) == expected, "Unit test failure range 0: %s");

        auto refBidir = new RefBidirCU!char(s);
        immutable bidirLen = refBidir.length;
        check(stride(refBidir) == expected, "Unit test failure bidir ref range code length: %s");
        check(refBidir.length == bidirLen, "Unit test failure bidir ref range length: %s");
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');

    // Indices are UTF-8 code unit offsets: "hello" occupies 0 .. 5, then
    // come a 4 unit, a 2 unit and another 4 unit sequence.
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 9);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 11);

    // Both overloads must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ stride(str, 0); }));
        static assert(isSafe!({ stride(str);    }));
        static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ stride(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
181254a7Smrg
@safe unittest // invalid start bytes
{
    import std.exception : assertThrown;

    // Bytes that can never begin a UTF-8 sequence.
    immutable char[] rejected = [
        0xF8, // 1111_1000: would announce a 5 byte sequence
        0xFC, // 1111_1100: 6
        0xFE, // 1111_1110: 7
        0xFF, // 1111_1111: 8
        0x80, // 1000_0000: continuation byte
    ];

    foreach (lead; rejected)
    {
        assertThrown!UTFException(stride([lead]));
    }
}
181254a7Smrg
/// Ditto
uint stride(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    // The bounds check is only possible when the range exposes a length.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-16 sequence");

    // A unit in 0xD800 .. 0xDBFF (high surrogate) announces a two unit
    // sequence; any other unit is reported as a sequence of one.
    immutable uint unit = str[index];
    immutable bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return startsPair ? 2 : 1;
}
181254a7Smrg
/// Ditto
uint stride(S)(auto ref S str) @safe pure
if (is(S : const wchar[]))
{
    // Arrays simply delegate to the indexed overload at the first code unit.
    enum size_t firstUnit = 0;
    return stride(str, firstUnit);
}
181254a7Smrg
/// Ditto
uint stride(S)(auto ref S str)
if (isInputRange!S && is(immutable ElementType!S == immutable wchar) &&
    !is(S : const wchar[]))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // A unit in 0xD800 .. 0xDBFF (high surrogate) announces a two unit
    // sequence; any other unit is reported as a sequence of one.
    immutable uint unit = str.front;
    immutable bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return startsPair ? 2 : 1;
}
181254a7Smrg
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Verifies that stride reports codeLength!wchar(c) code units for the
    // sequence starting at code unit `i` of `s`, for the string itself and
    // for the range wrappers used by this module's tests. Failures are
    // attributed to the caller's line.
    static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!wchar(c);

        // Throws an AssertError built from `fmt` when `ok` is false.
        void check(bool ok, string fmt)
        {
            if (!ok)
                throw new AssertError(format(fmt, s), __FILE__, line);
        }

        check(stride(s, i) == expected, "Unit test failure string: %s");
        check(stride(RandomCU!wchar(s), i) == expected, "Unit test failure range: %s");

        // A reference-type range must not be consumed by stride.
        auto refRandom = new RefRandomCU!wchar(s);
        immutable randLen = refRandom.length;
        check(stride(refRandom, i) == expected, "Unit test failure rand ref range: %s");
        check(refRandom.length == randLen, "Unit test failure rand ref range length: %s");

        // The overloads without an index only apply to the first sequence.
        if (i != 0)
            return;

        check(stride(s) == expected, "Unit test failure string 0: %s");
        check(stride(InputCU!wchar(s)) == expected, "Unit test failure range 0: %s");

        auto refBidir = new RefBidirCU!wchar(s);
        immutable bidirLen = refBidir.length;
        check(stride(refBidir) == expected, "Unit test failure bidir ref range code length: %s");
        check(refBidir.length == bidirLen, "Unit test failure bidir ref range length: %s");
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');

    // Indices are UTF-16 code unit offsets: "hello" occupies 0 .. 5, then
    // come a surrogate pair, a single unit and another surrogate pair.
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 7);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 8);

    // Both overloads must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
181254a7Smrg
/// Ditto
uint stride(S)(auto ref S str, size_t index = 0)
if (is(S : const dchar[]) ||
    (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    // Check the position against the length when there is one; otherwise
    // only emptiness can be checked.
    static if (is(typeof(str.length) : ulong))
        assert(index < str.length, "Past the end of the UTF-32 sequence");
    else
        assert(!str.empty, "UTF-32 sequence is empty.");

    // The code unit itself is never inspected: the result is always 1.
    enum uint unitsPerCodePoint = 1;
    return unitsPerCodePoint;
}
181254a7Smrg
///
@safe unittest
{
    // one UTF-8 code unit for ASCII, two for 'λ' (U+03BB)
    assert("a".stride == 1);
    assert("λ".stride == 2);

    // stride looks at one sequence only: the one at the front, or the one
    // starting at the given index
    assert("aλ".stride == 1);
    assert("aλ".stride(1) == 2);

    // A code point outside the Basic Multilingual Plane takes four UTF-8
    // code units. It is written as an escape so that the literal survives
    // lossy re-encoding of this file.
    assert("\U00010437".stride == 4);
}
*b1e83836Smrg
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Verifies that stride reports codeLength!dchar(c) code units for the
    // sequence starting at code unit `i` of `s`, for the string itself and
    // for the range wrappers used by this module's tests. Failures are
    // attributed to the caller's line.
    static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
    {
        immutable expected = codeLength!dchar(c);

        // Throws an AssertError built from `fmt` when `ok` is false.
        void check(bool ok, string fmt)
        {
            if (!ok)
                throw new AssertError(format(fmt, s), __FILE__, line);
        }

        check(stride(s, i) == expected, "Unit test failure string: %s");
        check(stride(RandomCU!dchar(s), i) == expected, "Unit test failure range: %s");

        // A reference-type range must not be consumed by stride.
        auto refRandom = new RefRandomCU!dchar(s);
        immutable randLen = refRandom.length;
        check(stride(refRandom, i) == expected, "Unit test failure rand ref range: %s");
        check(refRandom.length == randLen, "Unit test failure rand ref range length: %s");

        // The overloads without an index only apply to the first sequence.
        if (i != 0)
            return;

        check(stride(s) == expected, "Unit test failure string 0: %s");
        check(stride(InputCU!dchar(s)) == expected, "Unit test failure range 0: %s");

        auto refBidir = new RefBidirCU!dchar(s);
        immutable bidirLen = refBidir.length;
        check(stride(refBidir) == expected, "Unit test failure bidir ref range code length: %s");
        check(refBidir.length == bidirLen, "Unit test failure bidir ref range length: %s");
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'a');

    // In UTF-32 every code point is one code unit, so the indices are
    // plain code point positions.
    test("hello\U00010143\u0100\U00010143", 'h', 0);
    test("hello\U00010143\u0100\U00010143", 'e', 1);
    test("hello\U00010143\u0100\U00010143", 'l', 2);
    test("hello\U00010143\u0100\U00010143", 'l', 3);
    test("hello\U00010143\u0100\U00010143", 'o', 4);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
    test("hello\U00010143\u0100\U00010143", '\u0100', 6);
    test("hello\U00010143\u0100\U00010143", '\U00010143', 7);

    // Both call forms must be inferred @safe and pure for every array flavour.
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => stride(str, 0)));
        static assert(isSafe!(() => stride(str)   ));
        static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => stride(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
181254a7Smrg
/*
   Computes the length of a UTF-8 sequence from its non-ASCII lead byte.

   The length is the number of leading one bits of `c`; only 2, 3 and 4 are
   accepted.

   Params:
        c = first code unit of the sequence; its top bit must be set
        index = position of `c` in the caller's input, used only for the
                exception message

   Returns:
        2, 3 or 4

   Throws:
        `UTFException` if `c` is a continuation byte (one leading one bit)
        or announces a sequence longer than four code units.
 */
private uint strideImpl(char c, size_t index) @trusted pure
in { assert(c & 0x80); }
do
{
    import core.bitop : bsr;

    // 0xFF has no zero bit, so `~c & 0xFF` would be 0, and the result of
    // bsr is undefined for a zero argument. Reject that byte before ever
    // calling bsr.
    if (c != 0xFF)
    {
        // Index of the highest zero bit, converted into the number of
        // leading one bits.
        immutable msbs = 7 - bsr((~uint(c)) & 0xFF);
        if (msbs >= 2 && msbs <= 4)
            return msbs;
    }

    throw new UTFException("Invalid UTF-8 sequence", index);
}
*b1e83836Smrg
/++
    Determines how many code units make up the UTF sequence that ends one
    code unit before `index` in `str`.

    Params:
        str = bidirectional range of UTF code units. Must be random access if
        `index` is passed
        index = index one past end of UTF sequence (default: `str.length`)

    Returns:
        The number of code units in the UTF sequence. For UTF-8, this is a
        value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)).
        For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.

    Throws:
        May throw a `UTFException` if `index` is not one past the
        end of a valid UTF sequence.

    Note:
        `strideBack` only looks backwards from $(D str[index - 1]) for the
        code unit that starts the sequence. It will not fully verify the
        validity of the UTF sequence, nor even verify the presence of the
        sequence: it will not actually guarantee that
        $(D strideBack(str, index) <= index).
  +/
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable char)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-8 sequence");
    assert(index > 0, "Not the end of the UTF-8 sequence");

    // Continuation bytes have the bit pattern 10xx_xxxx.
    enum tagMask = 0b1100_0000;
    enum continuationTag = 0b1000_0000;

    // A last unit that is not a continuation byte is a sequence of one.
    if ((str[index - 1] & tagMask) != continuationTag)
        return 1;

    if (index < 4)
    {
        // Fewer than four units are available: guard every step back.
        static foreach (back; 2 .. 4)
        {
            if (index >= back && (str[index - back] & tagMask) != continuationTag)
                return back;
        }
    }
    else
    {
        // Most common case: four units are available, so a single check
        // on `index` covers all three steps back.
        static foreach (back; 2 .. 5)
        {
            if ((str[index - back] & tagMask) != continuationTag)
                return back;
        }
    }

    // Only continuation bytes were found within reach.
    throw new UTFException("Not the end of the UTF sequence", index);
}
181254a7Smrg
/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const char[]) ||
    (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char)))
{
    // Without an explicit index, measure the sequence that ends the string.
    const end = str.length;
    return strideBack(str, end);
}
181254a7Smrg
/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S)
{
    assert(!str.empty, "Past the end of the UTF-8 sequence");

    // Walk a saved copy so the caller's range is left untouched.
    auto probe = str.save;
    for (uint len = 1; len <= 4; ++len)
    {
        // The first code unit that is not 10xx_xxxx starts the sequence.
        if ((probe.back & 0b1100_0000) != 0b1000_0000)
            return len;
        probe.popBack();
        if (probe.empty)
            break;
    }
    throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
}
181254a7Smrg
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Compares strideBack with codeLength!char(c) for a string, a random
    // access range and a reference random access range; when no index is
    // given, the index-less overloads are exercised as well.
    static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable bool fromEnd = i == size_t.max;
        immutable size_t end = fromEnd ? s.length : i;
        immutable size_t want = codeLength!char(c);

        enforce(strideBack(s, end) == want,
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!char(s), end) == want,
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // Reference ranges must not be consumed by strideBack.
        auto randomRef = new RefRandomCU!char(s);
        immutable randomLenBefore = randomRef.length;
        enforce(strideBack(randomRef, end) == want,
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(randomRef.length == randomLenBefore,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (!fromEnd)
            return;

        enforce(strideBack(s) == want,
                new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

        enforce(strideBack(BidirCU!char(s)) == want,
                new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

        auto bidirRef = new RefBidirCU!char(s);
        immutable bidirLenBefore = bidirRef.length;
        enforce(strideBack(bidirRef) == want,
                new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
        enforce(bidirRef.length == bidirLenBefore,
                new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');

    // Indices are in UTF-8 code units.
    test("\U00010143\u0100\U00010143hello", 'o', 15);
    test("\U00010143\u0100\U00010143hello", 'l', 14);
    test("\U00010143\u0100\U00010143hello", 'l', 13);
    test("\U00010143\u0100\U00010143hello", 'e', 12);
    test("\U00010143\u0100\U00010143hello", 'h', 11);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
    test("\U00010143\u0100\U00010143hello", '\u0100', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 4);

    // Attribute inference: both overloads must be @safe and pure for arrays.
    foreach (S; AliasSeq!(char[], const char[], string))
    {
        enum str = to!S("hello world");
        static assert(isSafe!({ strideBack(str, 0); }));
        static assert(isSafe!({ strideBack(str);    }));
        static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({ strideBack(str);    }) & FunctionAttribute.pure_) != 0);
    }
    });
}
181254a7Smrg
//UTF-16 is self synchronizing: The length of strideBack can be found from
//the value of a single wchar
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (is(S : const wchar[]) ||
    (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar)))
{
    static if (is(typeof(str.length) : ulong))
        assert(index <= str.length, "Past the end of the UTF-16 sequence");
    assert(index > 0, "Not the end of a UTF-16 sequence");

    // A unit in [0xDC00, 0xE000) is the second half of a surrogate pair, so
    // the sequence is two units long; any other unit is counted as one.
    immutable last = str[index - 1];
    immutable bool trailingSurrogate = last >= 0xDC00 && last < 0xE000;
    return trailingSurrogate ? 2 : 1;
}
181254a7Smrg
/// Ditto
uint strideBack(S)(auto ref S str)
if (is(S : const wchar[]) ||
    (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar)))
{
    assert(!str.empty, "UTF-16 sequence is empty");

    // Arrays are indexed directly; other ranges go through `back`.
    static if (is(S : const(wchar)[]))
        immutable c2 = str[$ - 1];
    else
        immutable c2 = str.back;

    // Low (trailing) surrogates occupy 0xDC00 .. 0xDFFF. The upper bound is
    // exclusive: 0xE000 is an ordinary single-unit code point and must
    // yield 1, matching the indexed overload.
    return 1 + (0xDC00 <= c2 && c2 < 0xE000);
}
181254a7Smrg
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Compares strideBack with codeLength!wchar(c) for a wstring, a random
    // access range and a reference random access range; when no index is
    // given, the index-less overloads are exercised as well.
    static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable bool fromEnd = i == size_t.max;
        immutable size_t end = fromEnd ? s.length : i;
        immutable size_t want = codeLength!wchar(c);

        enforce(strideBack(s, end) == want,
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!wchar(s), end) == want,
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // Reference ranges must not be consumed by strideBack.
        auto randomRef = new RefRandomCU!wchar(s);
        immutable randomLenBefore = randomRef.length;
        enforce(strideBack(randomRef, end) == want,
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(randomRef.length == randomLenBefore,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (!fromEnd)
            return;

        enforce(strideBack(s) == want,
                new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

        enforce(strideBack(BidirCU!wchar(s)) == want,
                new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

        auto bidirRef = new RefBidirCU!wchar(s);
        immutable bidirLenBefore = bidirRef.length;
        enforce(strideBack(bidirRef) == want,
                new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
        enforce(bidirRef.length == bidirLenBefore,
                new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');

    // Indices are in UTF-16 code units.
    test("\U00010143\u0100\U00010143hello", 'o', 10);
    test("\U00010143\u0100\U00010143hello", 'l', 9);
    test("\U00010143\u0100\U00010143hello", 'l', 8);
    test("\U00010143\u0100\U00010143hello", 'e', 7);
    test("\U00010143\u0100\U00010143hello", 'h', 6);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
    test("\U00010143\u0100\U00010143hello", '\u0100', 3);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 2);

    // Attribute inference: both overloads must be @safe and pure for arrays.
    foreach (S; AliasSeq!(wchar[], const wchar[], wstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
181254a7Smrg
/// Ditto
uint strideBack(S)(auto ref S str, size_t index)
if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    static if (is(typeof(str.length) : ulong))
    {
        assert(index <= str.length, "Past the end of the UTF-32 sequence");
    }
    assert(index > 0, "Not the end of the UTF-32 sequence");

    // UTF-32 stores every code point in exactly one code unit.
    enum uint unitsPerCodePoint = 1;
    return unitsPerCodePoint;
}
181254a7Smrg
/// Ditto
uint strideBack(S)(auto ref S str)
if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar))
{
    assert(!str.empty, "Empty UTF-32 sequence");

    // UTF-32 stores every code point in exactly one code unit.
    enum uint unitsPerCodePoint = 1;
    return unitsPerCodePoint;
}
181254a7Smrg
///
@safe unittest
{
    assert("a".strideBack == 1);
    assert("λ".strideBack == 2);
    assert("aλ".strideBack == 2);
    assert("aλ".strideBack(1) == 1);
    // U+10437 lies outside the BMP and takes four UTF-8 code units. It is
    // written as an escape so the example survives re-encoding of the file.
    assert("\U00010437".strideBack == 4);
}
*b1e83836Smrg
@system unittest
{
    import core.exception : AssertError;
    import std.conv : to;
    import std.exception;
    import std.string : format;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Compares strideBack with codeLength!dchar(c) for a dstring, a random
    // access range and a reference random access range; when no index is
    // given, the index-less overloads are exercised as well.
    static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
    {
        immutable bool fromEnd = i == size_t.max;
        immutable size_t end = fromEnd ? s.length : i;
        immutable size_t want = codeLength!dchar(c);

        enforce(strideBack(s, end) == want,
                new AssertError(format("Unit test failure string: %s", s), __FILE__, line));

        enforce(strideBack(RandomCU!dchar(s), end) == want,
                new AssertError(format("Unit test failure range: %s", s), __FILE__, line));

        // Reference ranges must not be consumed by strideBack.
        auto randomRef = new RefRandomCU!dchar(s);
        immutable randomLenBefore = randomRef.length;
        enforce(strideBack(randomRef, end) == want,
                new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line));
        enforce(randomRef.length == randomLenBefore,
                new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line));

        if (!fromEnd)
            return;

        enforce(strideBack(s) == want,
                new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line));

        enforce(strideBack(BidirCU!dchar(s)) == want,
                new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line));

        auto bidirRef = new RefBidirCU!dchar(s);
        immutable bidirLenBefore = bidirRef.length;
        enforce(strideBack(bidirRef) == want,
                new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line));
        enforce(bidirRef.length == bidirLenBefore,
                new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line));
    }

    assertCTFEable!(
    {
    test("a", 'a');
    test(" ", ' ');
    test("\u2029", '\u2029'); //paraSep
    test("\u0100", '\u0100');
    test("\u0430", '\u0430');
    test("\U00010143", '\U00010143');
    test("abcdefcdef", 'f');

    // Indices are in UTF-32 code units, i.e. code points.
    test("\U00010143\u0100\U00010143hello", 'o', 8);
    test("\U00010143\u0100\U00010143hello", 'l', 7);
    test("\U00010143\u0100\U00010143hello", 'l', 6);
    test("\U00010143\u0100\U00010143hello", 'e', 5);
    test("\U00010143\u0100\U00010143hello", 'h', 4);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
    test("\U00010143\u0100\U00010143hello", '\u0100', 2);
    test("\U00010143\u0100\U00010143hello", '\U00010143', 1);

    // Attribute inference: both overloads must be @safe and pure for arrays.
    foreach (S; AliasSeq!(dchar[], const dchar[], dstring))
    {
        enum str = to!S("hello world");
        static assert(isSafe!(() => strideBack(str, 0)));
        static assert(isSafe!(() => strideBack(str)   ));
        static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!(() => strideBack(str)   ) & FunctionAttribute.pure_) != 0);
    }
    });
}
181254a7Smrg
181254a7Smrg
/++
    Given `index` into `str` and assuming that `index` is at the start
    of a UTF sequence, `toUCSindex` determines the number of UCS characters
    up to `index`. In other words, `index` is the position of the first code
    unit of a code point, and the result is how many code points precede it
    in the string.
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // One code unit per code point: the two indices coincide.
        return index;
    }
    else
    {
        size_t codePoints = 0;
        size_t pos = 0;

        // Hop from sequence start to sequence start until `index` is reached.
        while (pos < index)
        {
            pos += stride(str, pos);
            ++codePoints;
        }

        // Overshooting means `index` pointed into the middle of a sequence.
        if (pos > index)
        {
            static if (is(immutable C == immutable char))
                enum msg = "Invalid UTF-8 sequence";
            else
                enum msg = "Invalid UTF-16 sequence";
            throw new UTFException(msg, index);
        }

        return codePoints;
    }
}
181254a7Smrg
///
@safe unittest
{
    // ASCII only: code unit index and code point index agree in every encoding
    assert(`hello world`.toUCSindex(7) == 7);
    assert(`hello world`w.toUCSindex(7) == 7);
    assert(`hello world`d.toUCSindex(7) == 7);

    // one two-unit UTF-8 sequence before the index
    assert(`Ma Chérie`.toUCSindex(7) == 6);
    assert(`Ma Chérie`w.toUCSindex(7) == 7);
    assert(`Ma Chérie`d.toUCSindex(7) == 7);

    // three three-unit UTF-8 sequences before the index
    assert(`さいごの果実 / ミツバチと科学者`.toUCSindex(9) == 3);
    assert(`さいごの果実 / ミツバチと科学者`w.toUCSindex(9) == 9);
    assert(`さいごの果実 / ミツバチと科学者`d.toUCSindex(9) == 9);
}
181254a7Smrg
181254a7Smrg
/++
    Given a UCS index `n` into `str`, returns the UTF index.
    That is, `n` counts code points from the start of the string, and the
    result is the array index of the code unit at which that code point begins.
  +/
size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure
if (isSomeChar!C)
{
    static if (is(immutable C == immutable dchar))
    {
        // One code unit per code point: the two indices coincide.
        return n;
    }
    else
    {
        // Step over `n` whole sequences, accumulating their lengths.
        size_t offset = 0;
        foreach (_; 0 .. n)
            offset += stride(str, offset);
        return offset;
    }
}
181254a7Smrg
///
@safe unittest
{
    // ASCII only: code point index and code unit index agree in every encoding
    assert(`hello world`.toUTFindex(7) == 7);
    assert(`hello world`w.toUTFindex(7) == 7);
    assert(`hello world`d.toUTFindex(7) == 7);

    // one two-unit UTF-8 sequence among the first six code points
    assert(`Ma Chérie`.toUTFindex(6) == 7);
    assert(`Ma Chérie`w.toUTFindex(7) == 7);
    assert(`Ma Chérie`d.toUTFindex(7) == 7);

    // the first three code points take three UTF-8 code units each
    assert(`さいごの果実 / ミツバチと科学者`.toUTFindex(3) == 9);
    assert(`さいごの果実 / ミツバチと科学者`w.toUTFindex(9) == 9);
    assert(`さいごの果実 / ミツバチと科学者`d.toUTFindex(9) == 9);
}
181254a7Smrg
181254a7Smrg
181254a7Smrg/* =================== Decode ======================= */
181254a7Smrg
/// Flag selecting whether invalid UTF is replaced with $(LREF replacementDchar)
alias UseReplacementDchar = Flag!("useReplacementDchar");
181254a7Smrg
/++
    Decodes and returns the code point starting at `str[index]`. `index`
    is advanced to one past the decoded code point. If the code point is not
    well-formed, then a `UTFException` is thrown and `index` remains
    unchanged.

    decode will only work with strings and random access ranges of code units
    with length and slicing, whereas $(LREF decodeFront) will work with any
    input range of code units.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        index = starting index into `str`; incremented by number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str[index]` is not the start of a valid UTF
        sequence and useReplacementDchar is `No.useReplacementDchar`
  +/
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index)
if (!isSomeString!S &&
    isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S))
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Code units at or above the limit are handed to the full decoder.
    if (str[index] >= codeUnitLimit!S)
        return decodeImpl!(true, useReplacementDchar)(str, index);

    // Fast path: the code unit is returned as the code point.
    return str[index++];
}
181254a7Smrg
/// ditto
dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
auto ref scope S str, ref size_t index) @trusted pure
if (isSomeString!S)
in
{
    assert(index < str.length, "Attempted to decode past the end of a string");
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable unit = str[index];
    if (unit < codeUnitLimit!S)
    {
        // Fast path: the code unit is returned as the code point.
        ++index;
        return unit;
    }
    else static if (is(immutable S == immutable C[], C))
    {
        // The string is passed on as a const(C)[] slice.
        return decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index);
    }
}
*b1e83836Smrg
///
@safe pure unittest
{
    size_t i;

    assert("a".decode(i) == 'a');
    assert(i == 1);

    i = 0;
    assert("å".decode(i) == 'å');
    assert(i == 2);

    i = 1;
    assert("aå".decode(i) == 'å');
    assert(i == 3);

    i = 0;
    assert("å"w.decode(i) == 'å');
    assert(i == 1);

    // ë as a multi-code point grapheme
    i = 0;
    assert("e\u0308".decode(i) == 'e');
    assert(i == 1);

    // ë as a single code point grapheme
    i = 0;
    assert("ë".decode(i) == 'ë');
    assert(i == 2);

    i = 0;
    assert("ë"w.decode(i) == 'ë');
    assert(i == 1);
}
*b1e83836Smrg
@safe pure unittest // https://issues.dlang.org/show_bug.cgi?id=22867
{
    import std.conv : hexString;

    // If decoding this input throws, the index must be left where it was.
    string malformed = hexString!"f787a598";
    size_t offset = 0;
    try
    {
        malformed.decode(offset);
    }
    catch (UTFException e)
    {
        assert(offset == 0);
    }
}
181254a7Smrg
/++
    `decodeFront` is a variant of $(LREF decode) which specifically decodes
    the first code point. Unlike $(LREF decode), `decodeFront` accepts any
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
    of code units (rather than just a string or random access
    range). It also takes the range by `ref` and pops off the elements as it
    decodes them. If `numCodeUnits` is passed in, it gets set to the number
    of code units which were in the code point which was decoded.

    Params:
        useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
        str = input string or indexable Range
        numCodeUnits = set to number of code units processed

    Returns:
        decoded character

    Throws:
        $(LREF UTFException) if `str.front` is not the start of a valid UTF
        sequence. If an exception is thrown, then there is no guarantee as to
        the number of code units which were popped off, as it depends on the
        type of range being used and how many code units had to be popped off
        before the code point was determined to be invalid.
  +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable lead = str.front;

    if (lead >= codeUnitLimit!S)
    {
        // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be
        // done outside of decodeImpl, which is undesirable, since not all
        // overloads of decodeImpl need it. So, it should be moved back into
        // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521
        // has been fixed.
        enum sliceable = isRandomAccessRange!S && hasSlicing!S && hasLength!S;
        enum canIndex = is(S : const char[]) || sliceable;
        immutable decoded = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits);

        // The other range types were already popped by decodeImpl.
        static if (sliceable)
            str = str[numCodeUnits .. str.length];

        return decoded;
    }

    // Fast path: the front code unit is returned as the code point.
    str.popFront();
    numCodeUnits = 1;
    return lead;
}
181254a7Smrg
/// ditto
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
ref scope S str, out size_t numCodeUnits) @trusted pure
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable lead = str[0];
    if (lead < codeUnitLimit!S)
    {
        // Fast path: consume one code unit and return it as the code point.
        numCodeUnits = 1;
        str = str[1 .. $];
        return lead;
    }
    else static if (is(immutable S == immutable C[], C))
    {
        // Full decode, then drop the reported number of code units.
        immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, numCodeUnits);
        str = str[numCodeUnits .. $];
        return decoded;
    }
}
181254a7Smrg
/++ Ditto +/
dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isInputRange!S && isSomeChar!(ElementType!S))
{
    // Convenience overload: the code unit count is computed and discarded.
    size_t unusedCount;
    return decodeFront!useReplacementDchar(str, unusedCount);
}
181254a7Smrg
///
@safe pure unittest
{
    import std.range.primitives;
    string str = "Hello, World!";

    // the decoded code point is popped off the front
    assert(decodeFront(str) == 'H');
    assert(str == "ello, World!");

    str = "å";
    assert(decodeFront(str) == 'å');
    assert(str.empty);

    // the number of code units consumed can be requested as well
    str = "å";
    size_t i;
    assert(decodeFront(str, i) == 'å');
    assert(i == 2);
    assert(str.empty);
}
*b1e83836Smrg
/++
    `decodeBack` is a variant of $(LREF decode) which specifically decodes
    the last code point. Unlike $(LREF decode), `decodeBack` accepts any
    bidirectional range of code units (rather than just a string or random access
    range). It also takes the range by `ref` and pops off the elements as it
    decodes them. If `numCodeUnits` is passed in, it gets set to the number
    of code units which were in the code point which was decoded.

    Params:
        useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing
        str = input string or bidirectional Range
        numCodeUnits = gives the number of code units processed

    Returns:
        A decoded UTF character.

    Throws:
        $(LREF UTFException) if `str.back` is not the end of a valid UTF
        sequence. If an exception is thrown, the `str` itself remains unchanged,
        but there is no guarantee as to the value of `numCodeUnits` (when passed).
  +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (isSomeString!S)
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    immutable tail = str[$ - 1];
    if (tail < codeUnitLimit!S)
    {
        // Fast path: drop one code unit and return it as the code point.
        numCodeUnits = 1;
        str = str[0 .. $ - 1];
        return tail;
    }
    else static if (is(immutable S == immutable C[], C))
    {
        // Find where the final sequence starts, decode forward from there,
        // and shrink the string only after decoding has returned.
        numCodeUnits = strideBack(str);
        immutable start = str.length - numCodeUnits;
        size_t cursor = start;
        immutable decoded = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, cursor);
        str = str[0 .. start];
        return decoded;
    }
}
181254a7Smrg
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    ref S str, out size_t numCodeUnits)
if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S
    && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    if (str.back >= codeUnitLimit!S)
    {
        numCodeUnits = strideBack(str);
        static if (isRandomAccessRange!S)
        {
            // Decode in place from the start of the final sequence, then pop.
            size_t start = str.length - numCodeUnits;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(str, start);
            str.popBackExactly(numCodeUnits);
            return decoded;
        }
        else
        {
            // No indexing available: copy the final sequence off a saved
            // range into a small buffer, filling it back to front, and
            // decode the buffer instead.
            alias Char = Unqual!(ElementType!S);
            Char[4] buffer;
            S rest = str.save;
            size_t slot = numCodeUnits;
            while (slot > 0)
            {
                buffer[--slot] = rest.back;
                rest.popBack();
            }
            const Char[] codePoint = buffer[0 .. numCodeUnits];
            size_t cursor = 0;
            immutable decoded = decodeImpl!(true, useReplacementDchar)(codePoint, cursor);
            // Commit the shortened range only after decoding has returned.
            str = rest;
            return decoded;
        }
    }
    else
    {
        // Fast path: the back code unit is returned as the code point.
        numCodeUnits = 1;
        immutable single = str.back;
        str.popBack();
        return single;
    }
}
181254a7Smrg
/++ Ditto +/
dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
if (isSomeString!S
    || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S))
    || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S)))
in
{
    assert(!str.empty);
}
out (result)
{
    assert(isValidDchar(result));
}
do
{
    // Convenience form: forwards to the two-argument overload and simply
    // discards the number of code units that were consumed.
    size_t discardedCount;
    immutable decoded = decodeBack!useReplacementDchar(str, discardedCount);
    return decoded;
}
181254a7Smrg
///
@system pure unittest
{
    import std.range.primitives;

    // A trailing ASCII character occupies one code unit.
    string text = "Hello, World!";
    assert(text.decodeBack == '!');
    assert(text == "Hello, World");

    // A multi-byte character is removed from the string as a whole.
    text = "å";
    assert(text.decodeBack == 'å');
    assert(text.empty);

    // The two-argument form also reports how many code units were consumed.
    text = "å";
    size_t consumed;
    assert(text.decodeBack(consumed) == 'å');
    assert(consumed == 2);
    assert(text.empty);
}
*b1e83836Smrg
// Exclusive upper bound, for the code unit type of range `S`, below which a
// single code unit is guaranteed to be a complete, valid code point:
// 0x80 for char, 0xD800 for both wchar and dchar.
package template codeUnitLimit(S)
if (isSomeChar!(ElementEncodingType!S))
{
    // Compare with qualifiers normalised so const/immutable element types match.
    private alias Unit = immutable ElementEncodingType!S;

    static if (is(Unit == immutable char))
    {
        enum char codeUnitLimit = 0x80;
    }
    else static if (is(Unit == immutable wchar))
    {
        enum wchar codeUnitLimit = 0xD800;
    }
    else
    {
        enum dchar codeUnitLimit = 0xD800;
    }
}
181254a7Smrg
181254a7Smrg/*
181254a7Smrg * For strings, this function does its own bounds checking to give a
181254a7Smrg * more useful error message when attempting to decode past the end of a string.
181254a7Smrg * Subsequently it uses a pointer instead of an array to avoid
181254a7Smrg * redundant bounds checking.
181254a7Smrg *
181254a7Smrg * The three overloads of this operate on chars, wchars, and dchars.
 * This overload decodes UTF-8 (char code units).
181254a7Smrg *
181254a7Smrg * Params:
181254a7Smrg *      canIndex = if S is indexable; selects the pstr[i] code path instead of
 *                 front/popFront (see the commented-out enum in the body for
 *                 the intended definition)
181254a7Smrg *      useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing
181254a7Smrg *      str = input string or Range
181254a7Smrg *      index = starting index into str; incremented by number of code units processed
181254a7Smrg *
181254a7Smrg * Returns:
181254a7Smrg *      decoded character
 *
 * Throws:
 *      UTFException on an invalid or truncated sequence, but only when
 *      useReplacementDchar is No.useReplacementDchar
181254a7Smrg */
181254a7Smrgprivate dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
181254a7Smrg    auto ref S str, ref size_t index)
181254a7Smrgif (
*b1e83836Smrg    is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char)))
181254a7Smrg{
181254a7Smrg    /* The following encodings are valid, except for the 5 and 6 byte
181254a7Smrg     * combinations:
181254a7Smrg     *  0xxxxxxx
181254a7Smrg     *  110xxxxx 10xxxxxx
181254a7Smrg     *  1110xxxx 10xxxxxx 10xxxxxx
181254a7Smrg     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
181254a7Smrg     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
181254a7Smrg     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
181254a7Smrg     */
181254a7Smrg
181254a7Smrg    /* Dchar bitmask for different numbers of UTF-8 code units.
     * bitMask[k] keeps the payload bits of a (k + 1)-unit sequence:
     * 7, 11, 16 and 21 bits respectively.
181254a7Smrg     */
181254a7Smrg    alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);
181254a7Smrg
    // pstr is the input positioned at `index`: a raw pointer for arrays, a
    // slice for sliceable random-access ranges with length, and otherwise an
    // alias of str itself (so reading through front/popFront consumes str).
181254a7Smrg    static if (is(S : const char[]))
181254a7Smrg        auto pstr = str.ptr + index;    // this is what makes decodeImpl() @system code
181254a7Smrg    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
181254a7Smrg        auto pstr = str[index .. str.length];
181254a7Smrg    else
181254a7Smrg        alias pstr = str;
181254a7Smrg
*b1e83836Smrg    // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done
*b1e83836Smrg    // outside of decodeImpl
181254a7Smrg    //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S);
181254a7Smrg
    // Read the first code unit. The indexing path also records how many code
    // units remain from `index` on, which is what the manual bounds checks in
    // the loop below compare against.
181254a7Smrg    static if (canIndex)
181254a7Smrg    {
181254a7Smrg        immutable length = str.length - index;
181254a7Smrg        ubyte fst = pstr[0];
181254a7Smrg    }
181254a7Smrg    else
181254a7Smrg    {
181254a7Smrg        ubyte fst = pstr.front;
181254a7Smrg        pstr.popFront();
181254a7Smrg    }
181254a7Smrg
    // The exception helpers are only compiled into the throwing variant.
181254a7Smrg    static if (!useReplacementDchar)
181254a7Smrg    {
181254a7Smrg        static if (canIndex)
181254a7Smrg        {
            // Builds a UTFException carrying the offending code units: the
            // first unit plus up to three following continuation bytes
            // (10xxxxxx), never reading past the end of `str`.
181254a7Smrg            static UTFException exception(S)(S str, string msg)
181254a7Smrg            {
181254a7Smrg                uint[4] sequence = void;
181254a7Smrg                size_t i;
181254a7Smrg
181254a7Smrg                do
181254a7Smrg                {
181254a7Smrg                    sequence[i] = str[i];
181254a7Smrg                } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);
181254a7Smrg
181254a7Smrg                return new UTFException(msg, i).setSequence(sequence[0 .. i]);
181254a7Smrg            }
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        UTFException invalidUTF()
181254a7Smrg        {
181254a7Smrg            static if (canIndex)
181254a7Smrg               return exception(pstr[0 .. length], "Invalid UTF-8 sequence");
181254a7Smrg            else
181254a7Smrg            {
181254a7Smrg                //We can't include the invalid sequence with input strings without
181254a7Smrg                //saving each of the code units along the way, and we can't do it with
181254a7Smrg                //forward ranges without saving the entire range. Both would incur a
181254a7Smrg                //cost for the decoding of every character just to provide a better
181254a7Smrg                //error message for the (hopefully) rare case when an invalid UTF-8
181254a7Smrg                //sequence is encountered, so we don't bother trying to include the
181254a7Smrg                //invalid sequence here, unlike with strings and sliceable ranges.
181254a7Smrg               return new UTFException("Invalid UTF-8 sequence");
181254a7Smrg            }
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        UTFException outOfBounds()
181254a7Smrg        {
181254a7Smrg            static if (canIndex)
181254a7Smrg               return exception(pstr[0 .. length], "Attempted to decode past the end of a string");
181254a7Smrg            else
181254a7Smrg               return new UTFException("Attempted to decode past the end of a string");
181254a7Smrg        }
181254a7Smrg    }
181254a7Smrg
    // Any first byte without both top bits set (an ASCII byte or a stray
    // continuation byte) is rejected here.
    // NOTE(review): callers appear to handle single-unit code points before
    // calling, as decodeBack does via codeUnitLimit -- confirm for the others.
181254a7Smrg    if ((fst & 0b1100_0000) != 0b1100_0000)
181254a7Smrg    {
181254a7Smrg        static if (useReplacementDchar)
181254a7Smrg        {
181254a7Smrg            ++index;            // always consume bad input to avoid infinite loops
181254a7Smrg            return replacementDchar;
181254a7Smrg        }
181254a7Smrg        else
181254a7Smrg            throw invalidUTF(); // starter must have at least 2 first bits set
181254a7Smrg    }
181254a7Smrg    ubyte tmp = void;
181254a7Smrg    dchar d = fst; // upper control bits are masked out later
    // fst is shifted left once per code unit consumed; after each shift its
    // top bit tells whether the lead byte announced yet another unit.
181254a7Smrg    fst <<= 1;
181254a7Smrg
    // Unrolled at compile time for the 2nd, 3rd and 4th code unit
    // (i is the zero-based position of the unit being read).
181254a7Smrg    foreach (i; AliasSeq!(1, 2, 3))
181254a7Smrg    {
181254a7Smrg
        // The sequence ends before the unit the lead byte announced.
181254a7Smrg        static if (canIndex)
181254a7Smrg        {
181254a7Smrg            if (i == length)
181254a7Smrg            {
181254a7Smrg                static if (useReplacementDchar)
181254a7Smrg                {
181254a7Smrg                    index += i;
181254a7Smrg                    return replacementDchar;
181254a7Smrg                }
181254a7Smrg                else
181254a7Smrg                    throw outOfBounds();
181254a7Smrg            }
181254a7Smrg        }
181254a7Smrg        else
181254a7Smrg        {
181254a7Smrg            if (pstr.empty)
181254a7Smrg            {
181254a7Smrg                static if (useReplacementDchar)
181254a7Smrg                {
181254a7Smrg                    index += i;
181254a7Smrg                    return replacementDchar;
181254a7Smrg                }
181254a7Smrg                else
181254a7Smrg                    throw outOfBounds();
181254a7Smrg            }
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        static if (canIndex)
181254a7Smrg            tmp = pstr[i];
181254a7Smrg        else
181254a7Smrg        {
181254a7Smrg            tmp = pstr.front;
181254a7Smrg            pstr.popFront();
181254a7Smrg        }
181254a7Smrg
        // Every unit after the first must be a continuation byte (10xxxxxx).
181254a7Smrg        if ((tmp & 0xC0) != 0x80)
181254a7Smrg        {
181254a7Smrg            static if (useReplacementDchar)
181254a7Smrg            {
181254a7Smrg                index += i + 1;
181254a7Smrg                return replacementDchar;
181254a7Smrg            }
181254a7Smrg            else
181254a7Smrg                throw invalidUTF();
181254a7Smrg        }
181254a7Smrg
        // Append the six payload bits of the continuation byte.
181254a7Smrg        d = (d << 6) | (tmp & 0x3F);
181254a7Smrg        fst <<= 1;
181254a7Smrg
181254a7Smrg        if (!(fst & 0x80)) // no more bytes
181254a7Smrg        {
181254a7Smrg            d &= bitMask[i]; // mask out control bits
181254a7Smrg
181254a7Smrg            // overlong, could have been encoded with i bytes
181254a7Smrg            if ((d & ~bitMask[i - 1]) == 0)
181254a7Smrg            {
181254a7Smrg                static if (useReplacementDchar)
181254a7Smrg                {
181254a7Smrg                    index += i + 1;
181254a7Smrg                    return replacementDchar;
181254a7Smrg                }
181254a7Smrg                else
181254a7Smrg                    throw invalidUTF();
181254a7Smrg            }
181254a7Smrg
181254a7Smrg            // check for surrogates only needed for 3 bytes
181254a7Smrg            static if (i == 2)
181254a7Smrg            {
181254a7Smrg                if (!isValidDchar(d))
181254a7Smrg                {
181254a7Smrg                    static if (useReplacementDchar)
181254a7Smrg                    {
181254a7Smrg                        index += i + 1;
181254a7Smrg                        return replacementDchar;
181254a7Smrg                    }
181254a7Smrg                    else
181254a7Smrg                        throw invalidUTF();
181254a7Smrg                }
181254a7Smrg            }
181254a7Smrg
            // A 4-unit sequence can encode values above dchar.max; reject those.
181254a7Smrg            static if (i == 3)
181254a7Smrg            {
181254a7Smrg                if (d > dchar.max)
181254a7Smrg                {
181254a7Smrg                    static if (useReplacementDchar)
181254a7Smrg                        d = replacementDchar;
181254a7Smrg                    else
181254a7Smrg                        throw invalidUTF();
181254a7Smrg                }
181254a7Smrg            }
*b1e83836Smrg
            // Success (or replaced value): i + 1 code units were consumed.
*b1e83836Smrg            index += i + 1;
181254a7Smrg            return d;
181254a7Smrg        }
181254a7Smrg    }
181254a7Smrg
    // Reached only when the lead byte announced more than four code units
    // (the 5- and 6-byte forms listed above), which are not valid.
181254a7Smrg    static if (useReplacementDchar)
181254a7Smrg    {
181254a7Smrg        index += 4;             // read 4 chars by now
181254a7Smrg        return replacementDchar;
181254a7Smrg    }
181254a7Smrg    else
181254a7Smrg        throw invalidUTF();
181254a7Smrg}
181254a7Smrg
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-8 decodeImpl
    // through a bare input range (front/popFront/empty only).

    static struct CharInput
    {
      @safe pure @nogc nothrow:
        string data;
        size_t pos;

        this(string data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property char front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (bad; invalidUTFstrings!char())
    {
        auto input = CharInput(bad);
        size_t consumed;
        immutable decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Invalid input yields the replacement character and always consumes
        // at least one code unit, but never more than the string holds.
        assert(decoded == replacementDchar);
        assert(consumed >= 1);
        assert(consumed <= bad.length);
    }
}
181254a7Smrg
// UTF-16 overload of decodeImpl.
//
// Decodes the code point whose first code unit is at `index` in `str` and
// advances `index` by the number of code units consumed (1, or 2 for a
// surrogate pair). The first code unit must be >= 0xD800; smaller values have
// to be handled by the caller. With No.useReplacementDchar a UTFException is
// thrown for unpaired or truncated surrogates; otherwise replacementDchar is
// returned for them.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)
(auto ref S str, ref size_t index)
if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar)))
{
    // `units` is the input positioned at `index`: a raw pointer for arrays,
    // a slice for sliceable random-access ranges with length, and otherwise
    // an alias of `str` itself (which is then consumed while reading).
    static if (is(S : const wchar[]))
        auto units = str.ptr + index;
    else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S)
        auto units = str[index .. str.length];
    else
        alias units = str;

    // canIndex is passed in as a template argument instead of being computed
    // here because of https://issues.dlang.org/show_bug.cgi?id=14447.
    // It is meant to be:
    //   is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S)

    static if (canIndex)
    {
        immutable remaining = str.length - index;
        uint lead = units[0];
    }
    else
    {
        uint lead = units.front;
        units.popFront();
    }

    static if (!useReplacementDchar)
    {
        // Creates the exception for `msg`; the offending first code unit is
        // attached only when the input can be indexed.
        UTFException failure(string msg)
        {
            static if (canIndex)
                return new UTFException(msg).setSequence(units[0]);
            else
                return new UTFException(msg);
        }
    }

    // The < case must be taken care of before decodeImpl is called.
    assert(lead >= 0xD800);

    if (lead > 0xDBFF)
    {
        // Not a high surrogate: either a lone low surrogate (invalid) or an
        // ordinary code unit that is its own code point.
        if (lead >= 0xDC00 && lead <= 0xDFFF)
        {
            static if (useReplacementDchar)
                lead = replacementDchar;
            else
                throw failure("unpaired surrogate UTF-16 value");
        }

        // Note: u+FFFE and u+FFFF are specifically permitted by the
        // Unicode standard for application internal use (see isValidDchar)
        ++index;
        return cast(dchar) lead;
    }

    // High surrogate: a low surrogate has to follow.
    static if (canIndex)
        immutable bool truncated = remaining == 1;
    else
        immutable bool truncated = units.empty;

    if (truncated)
    {
        static if (useReplacementDchar)
        {
            ++index;
            return replacementDchar;
        }
        else
            throw failure("surrogate UTF-16 high value past end of string");
    }

    static if (canIndex)
        immutable uint trail = units[1];
    else
    {
        immutable uint trail = units.front;
        units.popFront();
    }

    if (trail >= 0xDC00 && trail <= 0xDFFF)
    {
        // Combine the pair into a code point in the range 0x10000 .. 0x10FFFF.
        lead = ((lead - 0xD7C0) << 10) + (trail - 0xDC00);
    }
    else
    {
        static if (useReplacementDchar)
            lead = replacementDchar;
        else
            throw failure("surrogate UTF-16 low value out of range");
    }

    // Both code units count as consumed, also when the second one was invalid.
    index += 2;
    return cast(dchar) lead;
}
181254a7Smrg
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-16 decodeImpl
    // through a bare input range (front/popFront/empty only).

    static struct WcharInput
    {
      @safe pure @nogc nothrow:
        wstring data;
        size_t pos;

        this(wstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property wchar front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (bad; invalidUTFstrings!wchar())
    {
        auto input = WcharInput(bad);
        size_t consumed;
        immutable decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Invalid input yields the replacement character and always consumes
        // at least one code unit, but never more than the string holds.
        assert(decoded == replacementDchar);
        assert(consumed >= 1);
        assert(consumed <= bad.length);
    }
}
181254a7Smrg
// UTF-32 overload of decodeImpl.
//
// Reads the single code unit at `index` (arrays and random-access ranges) or
// at the front of the range (all other ranges), validates it with
// isValidDchar and increments `index` by one. An invalid value is replaced by
// replacementDchar when useReplacementDchar is set; otherwise a UTFException
// carrying the value is thrown and neither `index` nor the range is advanced.
private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
    auto ref S str, ref size_t index)
if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar)))
{
    // Arrays are read through their pointer; ranges are used directly.
    static if (is(S : const dchar[]))
        auto source = str.ptr;
    else
        alias source = str;

    // Fetch the code unit, by position where possible, else from the front.
    static if (is(S : const dchar[]) || isRandomAccessRange!S)
        dchar unit = source[index];
    else
        dchar unit = source.front;

    if (!isValidDchar(unit))
    {
        static if (useReplacementDchar)
            unit = replacementDchar;
        else
            throw new UTFException("Invalid UTF-32 value").setSequence(unit);
    }

    ++index;

    // Non-indexable ranges are consumed as they are decoded.
    static if (!(is(S : const dchar[]) || isRandomAccessRange!S))
        source.popFront();

    return unit;
}
181254a7Smrg
@safe pure @nogc nothrow
unittest
{
    // Exercises the Yes.useReplacementDchar path of the UTF-32 decodeImpl
    // through a bare input range (front/popFront/empty only).

    static struct DcharInput
    {
      @safe pure @nogc nothrow:
        dstring data;
        size_t pos;

        this(dstring data) { this.data = data; }
        @property bool empty() { return pos == data.length; }
        @property dchar front() { return data[pos]; }
        void popFront() { ++pos; }
    }

    foreach (bad; invalidUTFstrings!dchar())
    {
        auto input = DcharInput(bad);
        size_t consumed;
        immutable decoded = decodeImpl!(false, Yes.useReplacementDchar)(input, consumed);

        // Invalid input yields the replacement character and always consumes
        // at least one code unit, but never more than the string holds.
        assert(decoded == replacementDchar);
        assert(consumed >= 1);
        assert(consumed <= bad.length);
    }
}
181254a7Smrg
181254a7Smrg
// Test helper: checks that decode(range, index) yields expectedChar, moves
// index to expectedIndex and leaves the length of range unchanged. Failures
// are reported as AssertErrors attributed to the caller's `line`.
// The check only runs for random-access ranges that are not narrow strings;
// for every other R this function does nothing.
version (StdUnittest) private void testDecode(R)(R range,
                                             size_t index,
                                             dchar expectedChar,
                                             size_t expectedIndex,
                                             size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;
    import std.traits : isNarrowString;

    static if (hasLength!R)
        immutable originalLength = range.length;

    static if (isRandomAccessRange!R && !isNarrowString!R)
    {
        immutable decoded = decode(range, index);

        if (decoded != expectedChar)
            throw new AssertError(format("decode: Wrong character: %s", decoded), __FILE__, line);

        if (index != expectedIndex)
            throw new AssertError(format("decode: Wrong index: %s", index), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != originalLength)
                throw new AssertError(format("decode: length changed: %s", range.length), __FILE__, line);
        }
    }
}
181254a7Smrg
// Test helper: checks that decodeFront(range, numCodeUnits) yields
// expectedChar, reports expectedNumCodeUnits and, for ranges with a length,
// shortens the range by exactly that many code units. Failures are reported
// as AssertErrors attributed to the caller's `line`. `range` is taken by
// reference, so the caller sees the consumed range.
version (StdUnittest) private void testDecodeFront(R)(ref R range,
                                                  dchar expectedChar,
                                                  size_t expectedNumCodeUnits,
                                                  size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.string : format;

    static if (hasLength!R)
        immutable originalLength = range.length;

    size_t consumed;
    immutable decoded = decodeFront(range, consumed);

    if (decoded != expectedChar)
        throw new AssertError(format("decodeFront: Wrong character: %s", decoded), __FILE__, line);

    if (consumed != expectedNumCodeUnits)
        throw new AssertError(format("decodeFront: Wrong numCodeUnits: %s", consumed), __FILE__, line);

    static if (hasLength!R)
    {
        if (range.length != originalLength - consumed)
            throw new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line);
    }
}
181254a7Smrg
// Test helper: checks that decodeBack(range, numCodeUnits) yields
// expectedChar, reports expectedNumCodeUnits and, for ranges with a length,
// shortens the range by exactly that many code units. Failures are reported
// as AssertErrors attributed to the caller's `line`.
// For ranges that are not bidirectional this is a no-op, which allows unit
// testing all `decode` functions together.
version (StdUnittest) private void testDecodeBack(R)(ref R range,
                                                 dchar expectedChar,
                                                 size_t expectedNumCodeUnits,
                                                 size_t line = __LINE__)
{
    static if (isBidirectionalRange!R)
    {
        import core.exception : AssertError;
        import std.string : format;

        static if (hasLength!R)
            immutable originalLength = range.length;

        size_t consumed;
        immutable decoded = decodeBack(range, consumed);

        if (decoded != expectedChar)
            throw new AssertError(format("decodeBack: Wrong character: %s", decoded), __FILE__, line);

        if (consumed != expectedNumCodeUnits)
            throw new AssertError(format("decodeBack: Wrong numCodeUnits: %s", consumed), __FILE__, line);

        static if (hasLength!R)
        {
            if (range.length != originalLength - consumed)
                throw new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line);
        }
    }
}
181254a7Smrg
// Test helper: runs testDecode (from index 0), testDecodeBack and
// testDecodeFront on a range that holds exactly one code point, expecting
// expectedChar and expectedIndex code units from each of them.
version (StdUnittest) private void testAllDecode(R)(R range,
                                                dchar expectedChar,
                                                size_t expectedIndex,
                                                size_t line = __LINE__)
{
    testDecode(range, 0, expectedChar, expectedIndex, line);

    // decodeBack works on a saved copy so that `range` is still available
    // for the decodeFront check below.
    static if (isBidirectionalRange!R)
    {
        auto fromBack = range.save;
        testDecodeBack(fromBack, expectedChar, expectedIndex, line);
    }

    testDecodeFront(range, expectedChar, expectedIndex, line);
}
181254a7Smrg
// Test helper: checks that decoding malformed input fails cleanly.
//
// For random-access ranges, decode(range, index) must throw a UTFException
// while leaving both `index` and the length of the range unchanged. When
// `index` is 0, decodeFront must throw a UTFException as well.
//
// Params:
//      range = code units holding an invalid sequence
//      index = position at which decode is attempted
//      line  = caller's line, used to attribute failures
version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__)
{
    import core.exception : AssertError;
    import std.exception : assertThrown, enforce;
    import std.string : format;

    immutable initialIndex = index;

    static if (hasLength!R)
        immutable lenBefore = range.length;

    static if (isRandomAccessRange!R)
    {
        assertThrown!UTFException(decode(range, index), null, __FILE__, line);
        enforce(index == initialIndex,
                new AssertError(format("decode: Wrong index: %s", index), __FILE__, line));
        static if (hasLength!R)
        {
            // The message needs a %s for range.length: format() rejects
            // arguments that have no matching specifier, which would replace
            // this AssertError by a FormatException on the failure path.
            enforce(range.length == lenBefore,
                    new AssertError(format("decode: length changed: %s", range.length), __FILE__, line));
        }
    }

    if (initialIndex == 0)
        assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line);
}
181254a7Smrg
// Test helper: checks that decodeBack fails cleanly on malformed input.
//
// For random-access ranges, decodeBack(range) must throw a UTFException and
// leave the length of the range unchanged. Bidirectional ranges without
// random access are accepted but not checked.
//
// Params:
//      range = code units holding an invalid sequence
//      line  = caller's line, used to attribute failures
version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__)
{
    // This condition is to allow unit testing all `decode` functions together
    static if (!isBidirectionalRange!R)
        return;
    else
    {
        import core.exception : AssertError;
        import std.exception : assertThrown, enforce;
        import std.string : format;

        static if (hasLength!R)
            immutable lenBefore = range.length;

        static if (isRandomAccessRange!R)
        {
            assertThrown!UTFException(decodeBack(range), null, __FILE__, line);
            static if (hasLength!R)
            {
                // The message needs a %s for range.length: format() rejects
                // arguments that have no matching specifier, which would
                // replace this AssertError by a FormatException on failure.
                enforce(range.length == lenBefore,
                        new AssertError(format("decodeBack: length changed: %s", range.length), __FILE__, line));
            }
        }
    }
}
181254a7Smrg
@system unittest
{
    import std.conv : to;
    import std.exception;

    assertCTFEable!(
    {
        // The same UTF-8 checks are run for each way of wrapping a string:
        // a plain string, an input range, a random-access range and the
        // class-based bidirectional and random-access ranges.
        foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char,
                              (string s) => new RefBidirCU!char(s),
                              (string s) => new RefRandomCU!char(s)))
        {
            enum sHasLength = hasLength!(typeof(S("abcd")));

            // Forward decoding of single-unit (ASCII) code points.
            {
                auto units = S("abcd");
                testDecode(units, 0, 'a', 1);
                testDecode(units, 1, 'b', 2);
                testDecodeFront(units, 'a', 1);
                testDecodeFront(units, 'b', 1);
                assert(decodeFront(units) == 'c');
                assert(decodeFront(units) == 'd');
            }

            // Forward decoding of three-unit code points.
            {
                auto units = S("ウェブサイト");
                testDecode(units, 0, 'ウ', 3);
                testDecode(units, 3, 'ェ', 6);
                testDecodeFront(units, 'ウ', 3);
                testDecodeFront(units, 'ェ', 3);
                assert(decodeFront(units) == 'ブ');
                assert(decodeFront(units) == 'サ');
            }

            // Backward decoding of single-unit (ASCII) code points.
            {
                auto units = S("abcd");
                testDecodeBack(units, 'd', 1);
                testDecodeBack(units, 'c', 1);
                testDecodeBack(units, 'b', 1);
                testDecodeBack(units, 'a', 1);
            }

            // Backward decoding of three-unit code points.
            {
                auto units = S("ウェブサイト");
                testDecodeBack(units, 'ト', 3);
                testDecodeBack(units, 'イ', 3);
                testDecodeBack(units, 'サ', 3);
                testDecodeBack(units, 'ブ', 3);
            }

            // Two- and three-unit sequences through every decode function.
            testAllDecode(S("\xC2\xA9"), '\u00A9', 2);
            testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3);

            // A truncated sequence, overlong encodings and the 5- and 6-unit
            // forms must all be rejected, from the front, from the middle
            // and from the back.
            foreach (malformed; ["\xE2\x89", // too short
                                 "\xC0\x8A",
                                 "\xE0\x80\x8A",
                                 "\xF0\x80\x80\x8A",
                                 "\xF8\x80\x80\x80\x8A",
                                 "\xFC\x80\x80\x80\x80\x8A"])
            {
                testBadDecode(S(malformed), 0);
                testBadDecode(S(malformed), 1);
                testBadDecodeBack(S(malformed));
            }

            // The encodings of U+FFFE and U+FFFF are expected to decode
            // successfully to those values.
            testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
            testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);

            // Three-unit sequences that would decode into the surrogate
            // range U+D800 .. U+DFFF must be rejected.
            foreach (malformed; ["\xED\xA0\x80",
                                 "\xED\xAD\xBF",
                                 "\xED\xAE\x80",
                                 "\xED\xAF\xBF",
                                 "\xED\xB0\x80",
                                 "\xED\xBE\x80",
                                 "\xED\xBF\xBF"])
            {
                testBadDecode(S(malformed), 0);
                testBadDecodeBack(S(malformed));
            }
        }
    });
}
181254a7Smrg
@system unittest
{
    import std.exception;

    // UTF-16 decoding; assertCTFEable evaluates the body at compile time
    // as well as at run time.
    assertCTFEable!(
    {
    // Each Src wraps the same UTF-16 data in a different kind of source:
    // a plain wstring, code unit range wrappers, and heap-allocated Ref* ranges.
    foreach (Src; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar,
                            (wstring s) => new RefBidirCU!wchar(s),
                            (wstring s) => new RefRandomCU!wchar(s)))
    {
        // Well-formed input: single units and surrogate pairs at the extremes.
        testAllDecode(Src([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(Src([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
        testAllDecode(Src([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
        testAllDecode(Src([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(Src([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // Ill-formed input: a high surrogate that is not followed by a low one.
        testBadDecode(Src([ cast(wchar) 0xD801 ]), 0);
        testBadDecode(Src([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);

        testBadDecodeBack(Src([ cast(wchar) 0xD801 ]));
        testBadDecodeBack(Src([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));

        // Forward decoding, first by index and then from the front.
        {
            auto text = Src("ウェブサイト");
            testDecode(text, 0, 'ウ', 1);
            testDecode(text, 1, 'ェ', 2);
            testDecodeFront(text, 'ウ', 1);
            testDecodeFront(text, 'ェ', 1);
            assert(decodeFront(text) == 'ブ');
            assert(decodeFront(text) == 'サ');
        }

        // Backward decoding from the end of a fresh copy.
        {
            auto text = Src("ウェブサイト");
            testDecodeBack(text, 'ト', 1);
            testDecodeBack(text, 'イ', 1);
            testDecodeBack(text, 'サ', 1);
            testDecodeBack(text, 'ブ', 1);
        }
    }

    // Surrogate pairs mixed with a BMP code unit in random-access sources.
    foreach (Src; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s)))
    {
        auto units = Src([cast(wchar) 0xD800, cast(wchar) 0xDC00,
                          cast(wchar) 0x1400,
                          cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
        testDecode(units, 0, cast(dchar) 0x10000, 2);
        testDecode(units, 2, cast(dchar) 0x1400, 3);
        testDecode(units, 3, cast(dchar) 0xB9DDE, 5);
        testDecodeBack(units, cast(dchar) 0xB9DDE, 2);
        testDecodeBack(units, cast(dchar) 0x1400, 1);
        testDecodeBack(units, cast(dchar) 0x10000, 2);
    }
    });
}
181254a7Smrg
@system unittest
{
    import std.exception;

    // UTF-32 decoding; assertCTFEable evaluates the body at compile time
    // as well as at run time.
    assertCTFEable!(
    {
    // Each Src wraps the same UTF-32 data in a different kind of source:
    // a plain dstring, code unit range wrappers, and heap-allocated Ref* ranges.
    foreach (Src; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar,
                            (dstring s) => new RefBidirCU!dchar(s),
                            (dstring s) => new RefRandomCU!dchar(s)))
    {
        // Well-formed input: every code point is exactly one code unit.
        testAllDecode(Src([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
        testAllDecode(Src([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
        testAllDecode(Src([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
        testAllDecode(Src([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
        testAllDecode(Src([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);

        // Ill-formed input: surrogate values and a value above U+10FFFF.
        testBadDecode(Src([cast(dchar) 0xD800]), 0);
        testBadDecode(Src([cast(dchar) 0xDFFE]), 0);
        testBadDecode(Src([cast(dchar) 0x110000]), 0);

        testBadDecodeBack(Src([cast(dchar) 0xD800]));
        testBadDecodeBack(Src([cast(dchar) 0xDFFE]));
        testBadDecodeBack(Src([cast(dchar) 0x110000]));

        // Forward decoding, first by index and then from the front.
        {
            auto text = Src("ウェブサイト");
            testDecode(text, 0, 'ウ', 1);
            testDecode(text, 1, 'ェ', 2);
            testDecodeFront(text, 'ウ', 1);
            testDecodeFront(text, 'ェ', 1);
            assert(decodeFront(text) == 'ブ');
            assert(decodeFront(text) == 'サ');
        }

        // Backward decoding from the end of a fresh copy.
        {
            auto text = Src("ウェブサイト");
            testDecodeBack(text, 'ト', 1);
            testDecodeBack(text, 'イ', 1);
            testDecodeBack(text, 'サ', 1);
            testDecodeBack(text, 'ブ', 1);
        }
    }

    // Supplementary and BMP code points in random-access sources.
    foreach (Src; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s)))
    {
        auto units = Src([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
        testDecode(units, 0, 0x10000, 1);
        testDecode(units, 1, 0x1400, 2);
        testDecode(units, 2, 0xB9DDE, 3);
        testDecodeBack(units, cast(dchar) 0xB9DDE, 1);
        testDecodeBack(units, cast(dchar) 0x1400, 1);
        testDecodeBack(units, cast(dchar) 0x10000, 1);
    }
    });
}
181254a7Smrg
@safe unittest
{
    import std.exception;
    import std.traits : FunctionAttribute, functionAttributes, isSafe;

    // Attribute inference checks for the decoding functions, repeated for
    // arrays of every character type and constness.
    assertCTFEable!(
    {
    foreach (Str; AliasSeq!( char[], const( char)[],  string,
                            wchar[], const(wchar)[], wstring,
                            dchar[], const(dchar)[], dstring))
    {
        // Callable from @safe code.
        static assert(isSafe!({ Str s; size_t idx = 0; decode(s, idx);      }));
        static assert(isSafe!({ Str s; size_t idx = 0; decodeFront(s, idx); }));
        static assert(isSafe!({ Str s; decodeFront(s); }));

        // Inferred as pure.
        static assert((functionAttributes!({
            Str s; size_t idx = 0; decode(s, idx);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            Str s; size_t idx = 0; decodeFront(s, idx);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            Str s; decodeFront(s);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            Str s; size_t idx = 0; decodeBack(s, idx);
        }) & FunctionAttribute.pure_) != 0);
        static assert((functionAttributes!({
            Str s; decodeBack(s);
        }) & FunctionAttribute.pure_) != 0);
    }
    });
}
181254a7Smrg
@safe unittest
{
    import std.exception;

    // Lead byte 0b1111_0111 followed by three 0b1011_1111 continuation
    // bytes: decoding this sequence must throw.
    char[4] val = "\xF7\xBF\xBF\xBF";
    size_t pos = 0;
    assertThrown!UTFException(decode(val[], pos));
}
181254a7Smrg/* =================== Encode ======================= */
181254a7Smrg
/+
    Shared failure path of the `encode` overloads.

    With `No.useReplacementDchar` a `UTFException` carrying `msg` is thrown,
    with `c` recorded through `setSequence`. With `Yes.useReplacementDchar`
    nothing is thrown and `replacementDchar` is returned so that the caller
    can encode it in place of `c`.
  +/
private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c)
{
    static if (useReplacementDchar == No.useReplacementDchar)
    {
        auto failure = new UTFException(msg);
        throw failure.setSequence(c);
    }
    else
    {
        return replacementDchar;
    }
}
181254a7Smrg
/++
    Encodes the code point `c` into the fixed-size buffer `buf` and returns
    how many code units were written: `1` to `4` for a `char[4]` buffer,
    `1` or `2` for a `wchar[2]` buffer and always `1` for a `dchar[1]`
    buffer.

    Params:
        useReplacementDchar = with `Yes.useReplacementDchar`, an invalid `c`
            is encoded as `replacementDchar` instead of raising an exception
        buf = destination buffer; being an `out` parameter, it is reset
            before anything is written to it
        c = the code point to encode

    Returns:
        The number of leading code units of `buf` that hold the result.

    Throws:
        `UTFException` if `c` is not a valid UTF code point and
        `useReplacementDchar` is `No.useReplacementDchar`.
  +/
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out char[4] buf, dchar c) @safe pure
{
    // One code unit: ASCII.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char) c;
        return 1;
    }

    // Two code units.
    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    // Four code units: everything above the BMP up to U+10FFFF.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        return 4;
    }

    // What is left is written as three code units: the rest of the BMP, or
    // the replacement character substituted for an invalid code point.
    if (c <= 0xFFFF)
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

        assert(isValidDchar(c));
    }
    else
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
    }

    buf[0] = cast(char)(0xE0 | (c >> 12));
    buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[2] = cast(char)(0x80 | (c & 0x3F));
    return 3;
}
181254a7Smrg
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    char[4] utf8;

    // Valid code points: the return value is the number of code units used.
    assert(encode(utf8, '\u0000') == 1 && utf8[0 .. 1] == "\u0000");
    assert(encode(utf8, '\u007F') == 1 && utf8[0 .. 1] == "\u007F");
    assert(encode(utf8, '\u0080') == 2 && utf8[0 .. 2] == "\u0080");
    assert(encode(utf8, '\uE000') == 3 && utf8[0 .. 3] == "\uE000");
    assert(encode(utf8, 0xFFFE) == 3 && utf8[0 .. 3] == "\xEF\xBF\xBE");

    // Invalid code points throw by default ...
    assertThrown!UTFException(encode(utf8, cast(dchar) 0x110000));

    // ... or are replaced when asked for.
    encode!(Yes.useReplacementDchar)(utf8, cast(dchar) 0x110000);
    auto encoded = utf8[];
    assert(encoded.decodeFront == replacementDchar);
}
*b1e83836Smrg
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    wchar[2] utf16;

    // Code points of the BMP take one code unit, all others take two.
    assert(encode(utf16, '\u0000') == 1 && utf16[0 .. 1] == "\u0000");
    assert(encode(utf16, '\uD7FF') == 1 && utf16[0 .. 1] == "\uD7FF");
    assert(encode(utf16, '\uE000') == 1 && utf16[0 .. 1] == "\uE000");
    assert(encode(utf16, '\U00010000') == 2 && utf16[0 .. 2] == "\U00010000");
    assert(encode(utf16, '\U0010FFFF') == 2 && utf16[0 .. 2] == "\U0010FFFF");

    // Invalid code points throw by default ...
    assertThrown!UTFException(encode(utf16, cast(dchar) 0xD800));

    // ... or are replaced when asked for.
    encode!(Yes.useReplacementDchar)(utf16, cast(dchar) 0x110000);
    auto encoded = utf16[];
    assert(encoded.decodeFront == replacementDchar);
}
*b1e83836Smrg
///
@safe unittest
{
    import std.exception : assertThrown;
    import std.typecons : Yes;

    dchar[1] utf32;

    // Every valid code point is stored as a single code unit.
    assert(encode(utf32, '\u0000') == 1 && utf32[0] == '\u0000');
    assert(encode(utf32, '\uD7FF') == 1 && utf32[0] == '\uD7FF');
    assert(encode(utf32, '\uE000') == 1 && utf32[0] == '\uE000');
    assert(encode(utf32, '\U0010FFFF') == 1 && utf32[0] == '\U0010FFFF');

    // Invalid code points throw by default ...
    assertThrown!UTFException(encode(utf32, cast(dchar) 0xD800));

    // ... or are replaced when asked for.
    encode!(Yes.useReplacementDchar)(utf32, cast(dchar) 0x110000);
    assert(utf32[0] == replacementDchar);
}
*b1e83836Smrg
@safe unittest
{
    import std.exception;

    // Static-buffer UTF-8 encoding, at compile time and at run time.
    assertCTFEable!(
    {
    char[4] units;

    // Both ends of the 1-, 2-, 3- and 4-unit ranges.
    assert(encode(units, '\u0000') == 1 && units[0 .. 1] == "\u0000");
    assert(encode(units, '\u007F') == 1 && units[0 .. 1] == "\u007F");
    assert(encode(units, '\u0080') == 2 && units[0 .. 2] == "\u0080");
    assert(encode(units, '\u07FF') == 2 && units[0 .. 2] == "\u07FF");
    assert(encode(units, '\u0800') == 3 && units[0 .. 3] == "\u0800");
    assert(encode(units, '\uD7FF') == 3 && units[0 .. 3] == "\uD7FF");
    assert(encode(units, '\uE000') == 3 && units[0 .. 3] == "\uE000");
    assert(encode(units, 0xFFFE) == 3 && units[0 .. 3] == "\xEF\xBF\xBE");
    assert(encode(units, 0xFFFF) == 3 && units[0 .. 3] == "\xEF\xBF\xBF");
    assert(encode(units, '\U00010000') == 4 && units[0 .. 4] == "\U00010000");
    assert(encode(units, '\U0010FFFF') == 4 && units[0 .. 4] == "\U0010FFFF");

    // Surrogates and a value above U+10FFFF are rejected by default.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                         cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(units, bad));

    // With replacement enabled the buffer receives U+FFFD instead.
    assert(encode!(Yes.useReplacementDchar)(units, cast(dchar) 0x110000) == units.stride);
    enum fffdUtf8 = "\uFFFD";
    assert(units[0 .. fffdUtf8.length] == fffdUtf8);
    });
}
181254a7Smrg
181254a7Smrg
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out wchar[2] buf, dchar c) @safe pure
{
    // Above the BMP and up to U+10FFFF: split into a surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        immutable offset = c - 0x10000;
        buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return 2;
    }

    // Everything else ends up as one code unit: a BMP code point, or the
    // replacement character substituted for an invalid one.
    if (c <= 0xFFFF)
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }
    else
    {
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }

    buf[0] = cast(wchar) c;
    return 1;
}
181254a7Smrg
@safe unittest
{
    import std.exception;

    // Static-buffer UTF-16 encoding, at compile time and at run time.
    assertCTFEable!(
    {
    wchar[2] units;

    // Single code units on both sides of the surrogate range, then pairs.
    assert(encode(units, '\u0000') == 1 && units[0 .. 1] == "\u0000");
    assert(encode(units, '\uD7FF') == 1 && units[0 .. 1] == "\uD7FF");
    assert(encode(units, '\uE000') == 1 && units[0 .. 1] == "\uE000");
    assert(encode(units, 0xFFFE) == 1 && units[0] == 0xFFFE);
    assert(encode(units, 0xFFFF) == 1 && units[0] == 0xFFFF);
    assert(encode(units, '\U00010000') == 2 && units[0 .. 2] == "\U00010000");
    assert(encode(units, '\U0010FFFF') == 2 && units[0 .. 2] == "\U0010FFFF");

    // Surrogates and a value above U+10FFFF are rejected by default.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                         cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(units, bad));

    // With replacement enabled the buffer receives the replacement character.
    assert(encode!(Yes.useReplacementDchar)(units, cast(dchar) 0x110000) == units.stride);
    assert(units.front == replacementDchar);
    });
}
181254a7Smrg
181254a7Smrg
/// Ditto
size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    out dchar[1] buf, dchar c) @safe pure
{
    immutable isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable aboveMax = 0x10FFFF < c;

    if (!isSurrogate && !aboveMax)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    // UTF-32 stores the (possibly replaced) code point as is.
    buf[0] = c;
    return 1;
}
181254a7Smrg
@safe unittest
{
    import std.exception;

    // Static-buffer UTF-32 encoding, at compile time and at run time.
    assertCTFEable!(
    {
    dchar[1] unit;

    // Valid code points are stored unchanged.
    encode(unit, '\u0000');
    assert(unit[0] == '\u0000');
    encode(unit, '\uD7FF');
    assert(unit[0] == '\uD7FF');
    encode(unit, '\uE000');
    assert(unit[0] == '\uE000');
    encode(unit, 0xFFFE );
    assert(unit[0] == 0xFFFE);
    encode(unit, 0xFFFF );
    assert(unit[0] == 0xFFFF);
    encode(unit, '\U0010FFFF');
    assert(unit[0] == '\U0010FFFF');

    // Surrogates and a value above U+10FFFF are rejected by default.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                         cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(unit, bad));

    // With replacement enabled the buffer receives the replacement character.
    assert(encode!(Yes.useReplacementDchar)(unit, cast(dchar) 0x110000) == unit.stride);
    assert(unit.front == replacementDchar);
    });
}
181254a7Smrg
181254a7Smrg
/++
    Appends the code point `c` to `str`, encoded in the encoding that
    corresponds to `str`'s character type.

    Params:
        useReplacementDchar = with `Yes.useReplacementDchar`, an invalid `c`
            is appended as `replacementDchar` instead of raising an exception
        str = the array that the encoded character is appended to
        c = the code point to encode

    Throws:
        `UTFException` if `c` is not a valid UTF code point and
        `useReplacementDchar` is `No.useReplacementDchar`.
  +/
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope char[] str, dchar c) @safe pure
{
    // ASCII is appended directly, without going through a scratch buffer.
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        str ~= cast(char) c;
        return;
    }

    char[4] units;
    size_t count;

    if (c <= 0x7FF)
    {
        assert(isValidDchar(c));
        units[0] = cast(char)(0xC0 | (c >> 6));
        units[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (0xFFFF < c && c <= 0x10FFFF)
    {
        assert(isValidDchar(c));
        units[0] = cast(char)(0xF0 | (c >> 18));
        units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }
    else
    {
        // Three code units: the rest of the BMP, or the replacement
        // character substituted for an invalid code point.
        if (c <= 0xFFFF)
        {
            if (0xD800 <= c && c <= 0xDFFF)
                c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);

            assert(isValidDchar(c));
        }
        else
        {
            assert(!isValidDchar(c));
            c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
        }

        units[0] = cast(char)(0xE0 | (c >> 12));
        units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }

    str ~= units[0 .. count];
}
*b1e83836Smrg
///
@safe unittest
{
    char[] text = "abcd".dup;
    dchar ascii = 'a';
    dchar nonAscii = 'ø';

    // An ASCII character adds one code unit.
    encode(text, ascii);
    assert(text.length == 5);
    assert(text == "abcda");

    // 'ø' adds two code units in UTF-8.
    encode(text, nonAscii);
    assert(text.length == 7);
    assert(text == "abcdaø");
}
181254a7Smrg
@safe unittest
{
    import std.exception;

    // Appending 1-, 2- and 3-unit characters to an existing UTF-8 array.
    assertCTFEable!(
    {
    char[] text = "abcd".dup;

    encode(text, cast(dchar)'a');
    assert(text.length == 5);
    assert(text == "abcda");

    encode(text, cast(dchar)'\u00A9');
    assert(text.length == 7);
    assert(text == "abcda\xC2\xA9");
    // The same check spelled with a \u escape is disabled in the original:
    //assert(text == "abcda\u00A9");   // BUG: fix compiler

    encode(text, cast(dchar)'\u2260');
    assert(text.length == 10);
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
    });
}
181254a7Smrg
@safe unittest
{
    import std.exception;

    // Appending UTF-8 encoding, at compile time and at run time.
    assertCTFEable!(
    {
    char[] text;

    // Each slice starts where the previous characters ended.
    encode(text, '\u0000');
    assert(text[0 .. $] == "\u0000");
    encode(text, '\u007F');
    assert(text[1 .. $] == "\u007F");
    encode(text, '\u0080');
    assert(text[2 .. $] == "\u0080");
    encode(text, '\u07FF');
    assert(text[4 .. $] == "\u07FF");
    encode(text, '\u0800');
    assert(text[6 .. $] == "\u0800");
    encode(text, '\uD7FF');
    assert(text[9 .. $] == "\uD7FF");
    encode(text, '\uE000');
    assert(text[12 .. $] == "\uE000");
    encode(text, 0xFFFE);
    assert(text[15 .. $] == "\xEF\xBF\xBE");
    encode(text, 0xFFFF);
    assert(text[18 .. $] == "\xEF\xBF\xBF");
    encode(text, '\U00010000');
    assert(text[21 .. $] == "\U00010000");
    encode(text, '\U0010FFFF');
    assert(text[25 .. $] == "\U0010FFFF");

    // Surrogates and a value above U+10FFFF are rejected by default.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                         cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(text, bad));

    // With replacement enabled U+FFFD is appended instead.
    enum fffdUtf8 = "\uFFFD";
    enum fffdLen = fffdUtf8.length;
    assert(text[$ - fffdLen .. $] != fffdUtf8);
    encode!(Yes.useReplacementDchar)(text, cast(dchar) 0x110000);
    assert(text[$ - fffdLen .. $] == fffdUtf8);
    });
}
181254a7Smrg
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope wchar[] str, dchar c) @safe pure
{
    // Above the BMP and up to U+10FFFF: append a surrogate pair.
    if (0xFFFF < c && c <= 0x10FFFF)
    {
        wchar[2] pair;

        assert(isValidDchar(c));
        immutable offset = c - 0x10000;
        pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        str ~= pair;
        return;
    }

    // Everything else is appended as one code unit: a BMP code point, or the
    // replacement character substituted for an invalid one.
    if (c <= 0xFFFF)
    {
        if (0xD800 <= c && c <= 0xDFFF)
            c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);

        assert(isValidDchar(c));
    }
    else
    {
        assert(!isValidDchar(c));
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
    }

    str ~= cast(wchar) c;
}
181254a7Smrg
@safe unittest
{
    import std.exception;

    // Appending UTF-16 encoding, at compile time and at run time.
    assertCTFEable!(
    {
    wchar[] text;

    // BMP code points add one code unit each.
    encode(text, '\u0000');
    assert(text[0] == '\u0000');
    encode(text, '\uD7FF');
    assert(text[1] == '\uD7FF');
    encode(text, '\uE000');
    assert(text[2] == '\uE000');
    encode(text, 0xFFFE);
    assert(text[3] == 0xFFFE);
    encode(text, 0xFFFF);
    assert(text[4] == 0xFFFF);

    // Code points above the BMP add a surrogate pair.
    encode(text, '\U00010000');
    assert(text[5 .. $] == "\U00010000");
    encode(text, '\U0010FFFF');
    assert(text[7 .. $] == "\U0010FFFF");

    // Surrogates and a value above U+10FFFF are rejected by default.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                         cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(text, bad));

    // With replacement enabled the replacement character is appended instead.
    assert(text.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(text, cast(dchar) 0x110000);
    assert(text.back == replacementDchar);
    });
}
181254a7Smrg
/// ditto
void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
    ref scope dchar[] str, dchar c) @safe pure
{
    immutable isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable aboveMax = 0x10FFFF < c;

    if (!isSurrogate && !aboveMax)
        assert(isValidDchar(c));
    else
        c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);

    // UTF-32 appends the (possibly replaced) code point as is.
    str ~= c;
}
181254a7Smrg
@safe unittest
{
    import std.exception;

    // Appending UTF-32 encoding, at compile time and at run time.
    assertCTFEable!(
    {
    dchar[] text;

    // Every valid code point adds exactly one element.
    encode(text, '\u0000');
    assert(text[0] == '\u0000');
    encode(text, '\uD7FF');
    assert(text[1] == '\uD7FF');
    encode(text, '\uE000');
    assert(text[2] == '\uE000');
    encode(text, 0xFFFE );
    assert(text[3] == 0xFFFE);
    encode(text, 0xFFFF );
    assert(text[4] == 0xFFFF);
    encode(text, '\U0010FFFF');
    assert(text[5] == '\U0010FFFF');

    // Surrogates and a value above U+10FFFF are rejected by default.
    foreach (dchar bad; [cast(dchar) 0xD800, cast(dchar) 0xDBFF, cast(dchar) 0xDC00,
                         cast(dchar) 0xDFFF, cast(dchar) 0x110000])
        assertThrown!UTFException(encode(text, bad));

    // With replacement enabled the replacement character is appended instead.
    assert(text.back != replacementDchar);
    encode!(Yes.useReplacementDchar)(text, cast(dchar) 0x110000);
    assert(text.back == replacementDchar);
    });
}
181254a7Smrg
181254a7Smrg
/++
    Returns how many code units of character type `C` are needed to encode
    the code point `c`.

    Params:
        C = the character type of the target encoding
        c = the code point to measure

    Returns:
        `1` when `C` is a 4-byte character type; `1` or `2` when it is a
        2-byte type; `1` to `4` when it is a 1-byte type. For 1-byte types
        a `c` above `U+10FFFF` hits `assert(false)`.
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    static if (C.sizeof == 4)
    {
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // Anything above the BMP needs a surrogate pair.
        if (c > 0xFFFF)
            return 2;
        return 1;
    }
    else
    {
        static assert(C.sizeof == 1);

        if (c > 0x10FFFF)
            assert(false);
        if (c > 0xFFFF)
            return 4;
        if (c > 0x7FF)
            return 3;
        if (c > 0x7F)
            return 2;
        return 1;
    }
}
181254a7Smrg
///
@safe pure nothrow @nogc unittest
{
    // An ASCII character is one code unit in every encoding.
    assert(codeLength!dchar('a') == 1);
    assert(codeLength!wchar('a') == 1);
    assert(codeLength!char('a') == 1);

    // The highest code point needs the most code units in each encoding.
    assert(codeLength!dchar('\U0010FFFF') == 1);
    assert(codeLength!wchar('\U0010FFFF') == 2);
    assert(codeLength!char('\U0010FFFF') == 4);
}
181254a7Smrg
181254a7Smrg
/++
    Returns the number of code units that are required to encode `input`
    in a string whose character type is `C`. This is particularly useful
    when slicing one string with the length of another and the two strings
    use different character types.

    Params:
        C = the character type to get the encoding length for
        input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
        to calculate the encoding length from
    Returns:
        The number of code units in `input` when encoded to `C`
  +/
size_t codeLength(C, InputRange)(InputRange input)
if (isSomeFiniteCharInputRange!InputRange)
{
    alias SourceChar = Unqual!(ElementEncodingType!InputRange);
    enum alreadyEncoded = isSomeString!InputRange && is(SourceChar == C)
        && is(typeof(input.length));

    static if (!alreadyEncoded)
    {
        // Walk the input one code point at a time and add up what each
        // of them costs in the target encoding.
        size_t units = 0;

        foreach (codePoint; input.byDchar)
            units += codeLength!C(codePoint);

        return units;
    }
    else
    {
        // A string that already uses `C`: its length is the answer.
        return input.length;
    }
}
181254a7Smrg
///
@safe unittest
{
    // ASCII text measured for each target encoding.
    assert(codeLength!char("hello world") == "hello world".length);
    assert(codeLength!wchar("hello world") == "hello world"w.length);
    assert(codeLength!dchar("hello world") == "hello world"d.length);

    // Non-ASCII text measured for each target encoding.
    assert(codeLength!char(`プログラミング`) == `プログラミング`.length);
    assert(codeLength!wchar(`プログラミング`) == `プログラミング`w.length);
    assert(codeLength!dchar(`プログラミング`) == `プログラミング`d.length);

    // Slicing a UTF-8 string by the length of a UTF-16 prefix.
    string sentence = `Être sans la verité, ça, ce ne serait pas bien.`;
    wstring prefix = `Être sans la verité`;
    assert(sentence[codeLength!char(prefix) .. $] ==
           `, ça, ce ne serait pas bien.`);
}
181254a7Smrg
@safe unittest
{
    import std.algorithm.iteration : filter;
    import std.conv : to;
    import std.exception;

    // Every source string type against every target character type.
    assertCTFEable!(
    {
    foreach (Str; AliasSeq!( char[], const  char[],  string,
                            wchar[], const wchar[], wstring,
                            dchar[], const dchar[], dstring))
    {
        foreach (Char; AliasSeq!(char, wchar, dchar))
        {
            enum mixed = `ウェブサイト@La_Verité.com`;

            // String inputs.
            foreach (sample; ["Walter Bright", `言語`, mixed])
                assert(codeLength!Char(to!Str(sample)) == to!(Char[])(sample).length);

            // The same text passed through filter, so it is no longer an array.
            assert(codeLength!Char(to!Str(mixed).filter!(x => true)()) ==
                   to!(Char[])(mixed).length);
        }
    }
    });
}
181254a7Smrg
/+
Internal helper function:

Returns true if it is safe to search for the code point `c` inside code
units of type `C`, without decoding them first.

The answer depends only on the size of `C`:
  - 1-byte code units: `c` must not exceed 0x7F;
  - 2-byte code units: `c` must not exceed 0xFFFF and must lie outside
    0xD800 .. 0xDFFF;
  - 4-byte code units: always true.

This is a runtime check that is used as an optimization in various functions,
particularly in `std.string`.
  +/
package bool canSearchInCodeUnits(C)(dchar c)
if (isSomeChar!C)
{
    enum width = C.sizeof;

    static if (width == 4)
    {
        return true;
    }
    else static if (width == 2)
    {
        immutable belowExcludedBlock = c < 0xD800;
        immutable aboveExcludedBlock = c >= 0xE000 && c < 0x1_0000;
        return belowExcludedBlock || aboveExcludedBlock;
    }
    else static if (width == 1)
    {
        return c < 0x80;
    }
    else
    {
        static assert(0);
    }
}
@safe unittest
{
    alias inUtf8  = canSearchInCodeUnits!char;
    alias inUtf16 = canSearchInCodeUnits!wchar;
    alias inUtf32 = canSearchInCodeUnits!dchar;

    // ASCII is searchable in every encoding.
    assert( inUtf8('a'));
    assert( inUtf16('a'));
    assert( inUtf32('a'));

    // Important test: ö <= 0xFF, so it fits in a byte, yet it must still be
    // rejected for char - both as a literal and after being cast to char.
    assert(!inUtf8('ö'));
    assert(!inUtf8(cast(char)'ö'));
    assert( inUtf16('ö'));
    assert( inUtf32('ö'));

    assert(!inUtf8('日'));
    assert( inUtf16('日'));
    assert( inUtf32('日'));

    // A value inside 0xD800 .. 0xDFFF is rejected for wchar only.
    assert(!inUtf16(cast(wchar) 0xDA00));
    assert( inUtf32(cast(dchar) 0xDA00));

    // Above 0xFFFF only dchar qualifies.
    assert(!inUtf8('\U00010001'));
    assert(!inUtf16('\U00010001'));
    assert( inUtf32('\U00010001'));
}
181254a7Smrg
181254a7Smrg/* =================== Validation ======================= */
181254a7Smrg
/++
    Checks to see if `str` is well-formed unicode or not.

    The whole string is walked with $(LREF decode), one sequence at a time;
    nothing is returned when every sequence decodes.

    Params:
        str = the string to check

    Throws:
        `UTFException` if `str` is not well-formed.
  +/
void validate(S)(in S str) @safe pure
if (isSomeString!S)
{
    size_t pos = 0;

    // decode advances pos past the sequence it has just read.
    while (pos < str.length)
        decode(str, pos);
}
181254a7Smrg
///
@safe unittest
{
    import std.exception : assertThrown;

    // 0xA7, 0x85 and 0xAF do not form well-formed UTF-8.
    char[] malformed = [0xA7, 0x85, 0xAF];
    assertThrown!UTFException(validate(malformed));
}
181254a7Smrg
// https://issues.dlang.org/show_bug.cgi?id=12923
@safe unittest
{
    import std.exception;

    // validate must also accept (and reject the contents of) a slice taken
    // from a static array.
    static void validateSliceOfStaticArray()
    {
        char[3] a = [167, 133, 175];
        validate(a[]);
    }

    assertThrown(validateSliceOfStaticArray());
}
181254a7Smrg
/**
 * Encodes the elements of `s` to UTF-8 and returns a newly allocated
 * string of the elements.
 *
 * Params:
 *     s = the string or finite range of characters to encode
 * Returns:
 *     A UTF-8 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
string toUTF8(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    // All three toUTFxx entry points share a single implementation.
    return toUTFImpl!(string, S)(s);
}
181254a7Smrg
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The ø (U+00F8) is represented by two UTF-8 code units
    assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));

    // U+10437 is four code units in UTF-8.  It is written as an escape
    // because the raw 4-byte character is easily destroyed by tools that
    // mishandle characters outside the Basic Multilingual Plane.
    assert("\U00010437"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
181254a7Smrg
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // Same checks as the documented example, but fed through a class-based
    // (reference) input range rather than a string.
    alias RT = ReferenceInputRange!(ElementType!(string));
    auto r1 = new RT("Hellø");
    // U+10437, written as an escape so the literal survives tools that
    // mishandle characters outside the Basic Multilingual Plane.
    auto r2 = new RT("\U00010437");

    assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
    assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
}
181254a7Smrg
/**
 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
 * `wstring` of the elements.
 *
 * Params:
 *     s = the string or finite range of characters to encode
 * Returns:
 *     A UTF-16 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
wstring toUTF16(S)(S s)
if (isSomeFiniteCharInputRange!S)
{
    // All three toUTFxx entry points share a single implementation.
    return toUTFImpl!(wstring, S)(s);
}
181254a7Smrg
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // These code points (U+24B62 and U+10437) are two code units in UTF-16
    // and one in UTF-32.  They are written as escapes because the raw 4-byte
    // characters are easily destroyed by tools that mishandle characters
    // outside the Basic Multilingual Plane.
    assert("\U00024B62"d.length == 1);
    assert("\U00010437"d.length == 1);

    assert("\U00024B62"d.toUTF16.equal([0xD852, 0xDF62]));
    assert("\U00010437"d.toUTF16.equal([0xD801, 0xDC37]));
}
181254a7Smrg
@system pure unittest
{
    import std.algorithm.comparison : equal;
    import std.internal.test.dummyrange : ReferenceInputRange;

    // Same checks as the documented example, but fed through a class-based
    // (reference) input range rather than a string.  The code points are
    // written as escapes so the literals survive tools that mishandle
    // characters outside the Basic Multilingual Plane.
    alias RT = ReferenceInputRange!(ElementType!(string));
    auto r1 = new RT("\U00024B62");
    auto r2 = new RT("\U00010437");

    assert(r1.toUTF16.equal([0xD852, 0xDF62]));
    assert(r2.toUTF16.equal([0xD801, 0xDC37]));
}
181254a7Smrg
181254a7Smrg
/**
 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
 * `dstring` of the elements.
 *
 * Params:
 *     s = the string or finite range of characters to encode
 * Returns:
 *     A UTF-32 string
 * See_Also:
 *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
 */
dstring toUTF32(S)(scope S s)
if (isSomeFiniteCharInputRange!S)
{
    // All three toUTFxx entry points share a single implementation.
    return toUTFImpl!(dstring, S)(s);
}
181254a7Smrg
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // These code points (U+24B62 and U+10437) are two code units in UTF-16
    // and one in UTF-32.  They are written as escapes because the raw 4-byte
    // characters are easily destroyed by tools that mishandle characters
    // outside the Basic Multilingual Plane.
    assert("\U00024B62"w.length == 2);
    assert("\U00010437"w.length == 2);

    assert("\U00024B62"w.toUTF32.equal([0x00024B62]));
    assert("\U00010437"w.toUTF32.equal([0x00010437]));
}
*b1e83836Smrg
// Shared implementation behind the toUTFxx entry points.
//
// T is the string type to produce and S the type of the input `s`.  When `s`
// already converts implicitly to T, an immutable duplicate is returned.
// Otherwise `s` is read through `byUTF` and the resulting code units are
// collected in an appender.
private T toUTFImpl(T, S)(scope S s)
{
    static if (!is(S : T))
    {
        import std.array : appender;

        alias CodeUnit = Unqual!(ElementEncodingType!T);
        auto sink = appender!T();

        // Capacity hint taken from the input's own length, when it has one.
        static if (is(S == C[], C) || hasLength!S)
            sink.reserve(s.length);

        foreach (unit; s.byUTF!CodeUnit)
            sink.put(unit);

        return sink.data;
    }
    else
    {
        return s.idup;
    }
}
181254a7Smrg
181254a7Smrg/* =================== toUTFz ======================= */
181254a7Smrg
/++
    Returns a C-style zero-terminated string equivalent to `str`. `str`
    must not contain embedded `'\0'`'s, because any C function treats the
    first `'\0'` that it sees as the end of the string. If `str.empty` is
    `true`, then a string containing only `'\0'` is returned.

    `toUTFz` accepts any type of string and is templated on the type of
    character pointer that you wish to convert to. It avoids allocating a
    new string when it can, but there is a decent chance that it will have
    to allocate one - particularly when dealing with character types other
    than `char`.

    $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then the
    string stops being zero-terminated as soon as anything alters the
    character one past the end of `str` (which is the `'\0'` character
    terminating the string). The most likely ways for that to happen are:
    $(UL
        $(LI you append to `str` and no reallocation takes place;)
        $(LI `str` is a slice of a larger array, and you alter the character
             in the larger array which is one character past the end of
             `str`;)
        $(LI a mutable character array sits immediately after `str` in memory
             (for example, two member variables of a user-defined type
             declared one right after the other), that array happened to
             start with `'\0'`, and it is later modified.)
    )
    Such scenarios never occur if you use the zero-terminated string
    immediately after calling `toUTFz` and the C function using it doesn't
    keep a reference to it. They are also unlikely to occur even if you save
    the zero-terminated string (the cases above are among the few examples of
    where it could happen). However, if you save the zero-terminated string
    and want to be absolutely certain that it stays zero-terminated, then
    simply append a `'\0'` to the string and use its `ptr` property rather
    than calling `toUTFz`.

    $(RED Warning 2:) When passing a character pointer to a C function, and the
    C function keeps it around for any reason, make sure that you keep a
    reference to it in your D code. Otherwise, it may go away during a garbage
    collection cycle and cause a nasty bug when the C code tries to use it.

    Params:
        P = the character pointer type to produce
        str = the string to convert
  +/
template toUTFz(P)
if (isPointer!P && isSomeChar!(typeof(*P.init)))
{
    P toUTFz(S)(S str) @safe pure
    if (isSomeString!S)
    {
        // The matching toUTFzImpl overload is selected by its constraint on
        // the pointer type and the string type.
        return toUTFzImpl!(P, S)(str);
    }
}
181254a7Smrg
///
@safe pure unittest
{
    // Same code unit type in and out; only the pointer's qualifier differs.
    char*            p1 = toUTFz!(char*)("hello world");
    const(char)*     p2 = toUTFz!(const(char)*)("hello world");
    immutable(char)* p3 = toUTFz!(immutable(char)*)("hello world");

    // The code unit type may change as well, in any direction.
    char*             p4 = toUTFz!(char*)("hello world"d);
    const(wchar)*     p5 = toUTFz!(const(wchar)*)("hello world");
    immutable(dchar)* p6 = toUTFz!(immutable(dchar)*)("hello world"w);
}
181254a7Smrg
// immutable(C)[] -> C*, const(C)*, or immutable(C)*
//
// An empty string yields a fresh one-element array holding '\0'.  For a
// const or immutable target the code first peeks at the code unit just past
// the end of `str` and returns the original pointer when a terminator is
// already there.  Every other case is forwarded to the const(C)[] overload.
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(immutable typeof(*P.init) == typeof(str[0])))
{
    alias Target = typeof(*P.init);

    if (str.empty)
    {
        Target[] terminatorOnly = ['\0'];

        auto trustedPtr() @trusted { return terminatorOnly.ptr; }
        return trustedPtr();
    }

    alias C = Unqual!(ElementEncodingType!S);

    // If P is mutable, then we have to make a copy, so the in-place check
    // below only applies to const and immutable targets.
    enum targetIsMutable = is(Unqual!Target == Target);

    static if (!targetIsMutable)
    {
        if (!__ctfe)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            immutable onePastEnd = trustedPtrAdd(str);

            // Peek past end of str, if it's 0, no conversion necessary.
            // Note that the compiler will put a 0 past the end of static
            // strings, and the storage allocator will put a 0 past the end
            // of newly allocated char[]'s.
            // Is the pointer dereferenceable? A simple test: if it points to
            // an address multiple of 4, then conservatively assume the
            // pointer might be pointing to a new block of memory, which might
            // be unreadable. Otherwise, it's definitely pointing to valid
            // memory.
            if ((cast(size_t) onePastEnd & 3) && *onePastEnd == '\0')
                return &str[0];
        }
    }

    return toUTFzImpl!(P, const(C)[])(cast(const(C)[]) str);
}
181254a7Smrg
// C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
//
// Two strategies, chosen at compile time from the qualifiers involved:
//   - const(C)[] -> const(C)*, or C[] -> C* / const(C)*: reuse `str` when a
//     terminator already follows it, otherwise append '\0' to it;
//   - const(C)[] -> C* / immutable(C)*, or C[] -> immutable(C)*: always copy
//     into a new array that is one element longer.
private P toUTFzImpl(P, S)(return scope S str) @safe pure
if (is(typeof(str[0]) C) && is(immutable typeof(*P.init) == immutable C) && !is(C == immutable))
{
    alias Src = typeof(str[0]);
    alias Dst = typeof(*P.init);

    enum srcIsConst     = is(const(Unqual!Src) == Src);
    enum dstIsConst     = is(const(Unqual!Dst) == Dst);
    enum dstIsImmutable = is(immutable(Unqual!Dst) == Dst);

    static if ((srcIsConst && dstIsConst) || (!srcIsConst && !dstIsImmutable))
    {
        if (!__ctfe)
        {
            auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; }
            auto onePastEnd = trustedPtrAdd(str);

            // Only dereference when the address is not a multiple of 4; the
            // immutable(C)[] overload's comment gives the reasoning.
            if ((cast(size_t) onePastEnd & 3) && *onePastEnd == '\0')
                return &str[0];
        }

        str ~= '\0';
        return &str[0];
    }
    else
    {
        import std.array : uninitializedArray;

        // Room for every code unit of str plus the terminator.
        auto copy = uninitializedArray!(Unqual!Dst[])(str.length + 1);
        copy[0 .. str.length] = str[];
        copy[str.length] = '\0';

        auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; }
        return trustedCast(copy);
    }
}
181254a7Smrg
// C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
//
// The code unit type changes, so the string is always rebuilt: it is walked
// code point by code point, each one is put into an appender of the target
// character type, and a terminating '\0' is added at the end.
private P toUTFzImpl(P, S)(S str) @safe pure
if (!is(immutable typeof(*P.init) == immutable typeof(str[0])))
{
    import std.array : appender;

    alias Dst = typeof(*P.init);
    auto sink = appender!(Dst[])();

    foreach (dchar codePoint; str)
        sink.put(codePoint);
    sink.put('\0');

    auto trustedPtr() @trusted { return cast(P) sink.data.ptr; }
    return trustedPtr();
}
181254a7Smrg
@safe pure unittest
{
    import core.exception : AssertError;
    import std.algorithm;
    import std.conv : to;
    import std.exception;
    import std.string : format;

    // Part 1: source and target share the code unit type.
    assertCTFEable!(
    {
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        alias C = Unqual!(ElementEncodingType!S);

        auto fromLiteral = to!S("hello\U00010143\u0100\U00010143");

        // Build an equal string whose storage has '\n' rather than '\0'
        // directly after the last code unit.
        auto buffer = new C[](fromLiteral.length + 1);
        buffer[0 .. $ - 1] = fromLiteral[0 .. $];
        buffer[$ - 1] = '\n';
        --buffer.length;
        auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); }
        auto notTerminated = trustedAssumeUnique(buffer);
        assert(fromLiteral == notTerminated);

        // The result must hold the same code units followed by '\0'.
        void trustedCStringAssert(P, S)(S s) @trusted
        {
            auto p = toUTFz!P(s);
            assert(p[0 .. s.length] == s);
            assert(p[s.length] == '\0');
        }

        foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*))
        {
            trustedCStringAssert!P(fromLiteral);
            trustedCStringAssert!P(notTerminated);
        }
    }
    });

    // Part 2: the code unit type changes, so the lengths differ.  Find the
    // terminator by scanning, then compare code point by code point.
    static void test(P, S)(S s, size_t line = __LINE__) @trusted
    {
        static size_t zeroLen(C)(const(C)* ptr) @trusted
        {
            size_t len = 0;
            while (*ptr != '\0') { ++ptr; ++len; }
            return len;
        }

        auto p = toUTFz!P(s);
        immutable len = zeroLen(p);
        enforce(cmp(s, p[0 .. len]) == 0,
                new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof),
                                __FILE__, line));
    }

    assertCTFEable!(
    {
    // string -> wider pointers
    foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143");
    }
    // wstring -> char and dchar pointers
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"w);
    }
    // dstring -> narrower pointers
    foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                          wchar*, const(wchar)*, immutable(wchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"d);
    }
    // mutable and const arrays -> every pointer type
    foreach (S; AliasSeq!( char[], const( char)[],
                          wchar[], const(wchar)[],
                          dchar[], const(dchar)[]))
    {
        auto input = to!S("hello\U00010143\u0100\U00010143");

        foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*,
                              wchar*, const(wchar)*, immutable(wchar)*,
                              dchar*, const(dchar)*, immutable(dchar)*))
        {
            test!P(input);
        }
    }
    });
}
181254a7Smrg
181254a7Smrg
/++
    `toUTF16z` is a convenience function for `toUTFz!(const(wchar)*)`.

    Encodes string `str` into UTF-16 and returns the encoded string.
    `toUTF16z` is suitable for calling the 'W' functions in the Win32 API
    that take an `LPCWSTR` argument.

    Params:
        str = the string to encode
  +/
const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure
if (isSomeChar!C)
{
    alias toWideZ = toUTFz!(const(wchar)*);
    return toWideZ(str);
}
181254a7Smrg
///
@system unittest
{
    string greeting = "Hello, World!";
    auto wide = greeting.toUTF16z;

    // The element right after the last character is the terminator.
    static assert(is(typeof(wide) == const(wchar)*));
    assert(wide[greeting.length] == '\0');
}
*b1e83836Smrg
@safe pure unittest
{
    import std.conv : to;

    // toUTFz is already thoroughly tested, so this will just verify that
    // toUTF16z compiles properly for the various string types.
    foreach (S; AliasSeq!(string, wstring, dstring))
    {
        auto input = to!S("hello world");
        assert(toUTF16z(input) !is null);
    }
}
181254a7Smrg
181254a7Smrg
181254a7Smrg/* ================================ tests ================================== */
181254a7Smrg
@safe pure unittest
{
    import std.exception;

    // Checks all six conversions between three spellings of the same text.
    static void checkAllPairs(string s, wstring w, dstring d) @safe pure
    {
        assert(toUTF16(s) == w);
        assert(toUTF32(s) == d);
        assert(toUTF8 (w) == s);
        assert(toUTF32(w) == d);
        assert(toUTF8 (d) == s);
        assert(toUTF16(d) == w);
    }

    assertCTFEable!(
    {
        // ASCII only
        checkAllPairs("hello", "hello", "hello");

        // contains U+1234
        checkAllPairs("hel\u1234o", "hel\u1234o", "hel\u1234o");

        // contains U+10AAAA
        checkAllPairs("he\U0010AAAAllo", "he\U0010AAAAllo", "he\U0010AAAAllo");
    });
}
181254a7Smrg
181254a7Smrg
/++
    Returns the total number of code points encoded in `str`.

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Params:
        str = the string whose code points are counted

    Returns:
        The number of elements produced by walking `str.byDchar`.

    Note:
        This function is `nothrow` and `@nogc`, so it does not throw
        `UTFException` for input that is not well-formed. Use
        $(LREF validate) first if malformed input must be detected.
        How a malformed sequence contributes to the total is decided by
        $(LREF byDchar) (presumably one $(LREF replacementDchar) per
        invalid sequence - confirm against `byDchar`'s documentation).
  +/
size_t count(C)(const(C)[] str) @safe pure nothrow @nogc
if (isSomeChar!C)
{
    return str.byDchar.walkLength;
}
*b1e83836Smrg
///
@safe pure nothrow @nogc unittest
{
    assert("".count == 0);
    assert("a".count == 1);
    assert("abc".count == 3);

    // The euro sign is a single code point, followed by three digits.
    assert("\u20AC100".count == 4);
}
181254a7Smrg
@safe pure nothrow @nogc unittest
{
    import std.exception;

    // Same checks as the documented example, run both at compile time and
    // at run time.
    static void checks() @safe pure nothrow @nogc
    {
        assert(count("") == 0);
        assert(count("a") == 1);
        assert(count("abc") == 3);
        assert(count("\u20AC100") == 4);
    }

    assertCTFEable!checks;
}
181254a7Smrg
181254a7Smrg
// Ranges of code units for testing.
//
// Each type wraps a mutable C[] copy of the string it is constructed from
// and exposes one level of the range API.  The Ref* variants are classes, so
// they have reference semantics.
version (StdUnittest)
{
private:
    // Input range only: empty / front / popFront.
    struct InputCU(C)
    {
        import std.conv : to;

        C[] _str;

        this(inout(C)[] str)
        {
            _str = str.to!(C[]);
        }

        @property bool empty() { return _str.length == 0; }
        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }
    }

    // Bidirectional range with length, but without indexing or slicing.
    struct BidirCU(C)
    {
        import std.conv : to;

        C[] _str;

        this(inout(C)[] str)
        {
            _str = str.to!(C[]);
        }

        @property bool empty() { return _str.length == 0; }
        @property size_t length() { return _str.length; }
        @property auto save() { return BidirCU(_str); }

        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
    }

    // Random-access range with length and slicing.
    struct RandomCU(C)
    {
        import std.conv : to;

        C[] _str;

        this(inout(C)[] str)
        {
            _str = str.to!(C[]);
        }

        @property bool empty() { return _str.length == 0; }
        @property size_t length() { return _str.length; }
        @property auto save() { return RandomCU(_str); }

        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }

        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); }
    }

    // Class counterpart of BidirCU.
    class RefBidirCU(C)
    {
        import std.conv : to;

        C[] _str;

        this(inout(C)[] str)
        {
            _str = str.to!(C[]);
        }

        @property bool empty() { return _str.length == 0; }
        @property size_t length() { return _str.length; }
        @property auto save() { return new RefBidirCU(_str); }

        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }
    }

    // Class counterpart of RandomCU.
    class RefRandomCU(C)
    {
        import std.conv : to;

        C[] _str;

        this(inout(C)[] str)
        {
            _str = str.to!(C[]);
        }

        @property bool empty() { return _str.length == 0; }
        @property size_t length() { return _str.length; }
        @property auto save() { return new RefRandomCU(_str); }

        @property C front() { return _str[0]; }
        void popFront() { _str = _str[1 .. $]; }

        @property C back() { return _str[$ - 1]; }
        void popBack() { _str = _str[0 .. $ - 1]; }

        C opIndex(size_t i) { return _str[i]; }
        auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); }
    }
}
181254a7Smrg
181254a7Smrg
/**
 * The Unicode replacement character (U+FFFD), inserted in place of invalid
 * UTF sequences.
 *
 * References:
 *      $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character)
 */
enum dchar replacementDchar = '\uFFFD';
181254a7Smrg
/********************************************
 * Iterate a range of char, wchar, or dchars by code unit.
 *
 * The purpose is to bypass the special case decoding that
 * $(REF front, std,range,primitives) does to character arrays. As a result,
 * using ranges with `byCodeUnit` can be `nothrow` while
 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode
 * sequences.
 *
 * A code unit is a building block of the UTF encodings. Generally, an
 * individual code unit does not represent what's perceived as a full
 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters
 * are encoded with multiple code units. For example, the UTF-8 code units for
 * `ø` are `0xC3 0xB8`. That means an individual element of `byCodeUnit`
 * often does not form a character on its own. Attempting to treat it as
 * one while iterating over the resulting range will give nonsensical results.
 *
 * Params:
 *      r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 *      of characters (including strings) or a type that implicitly converts to a string type.
 * Returns:
 *      If `r` is not an auto-decodable string (i.e. a narrow string or a
 *      user-defined type that implicitly converts to a string type), then `r`
 *      is returned.
 *
 *      Otherwise, `r` is converted to its corresponding string type (if it's
 *      not already a string) and wrapped in a random-access range where the
 *      element encoding type of the string (its code unit) is the element type
 *      of the range, and that range is returned. The range has slicing.
 *
 *      If `r` is quirky enough to be a struct or class which is an input range
 *      of characters on its own (i.e. it has the input range API as member
 *      functions), $(I and) it's implicitly convertible to a string type, then
 *      `r` is returned, and no implicit conversion takes place.
 *
 *      If `r` is wrapped in a new range, then that range has a `source`
 *      property for returning the string that's currently contained within that
 *      range.
 *
 * See_Also:
 *      Refer to the $(MREF std, uni) docs for a reference on Unicode
 *      terminology.
 *
 *      For a range that iterates by grapheme cluster (written character) see
 *      $(REF byGrapheme, std,uni).
 */
auto byCodeUnit(R)(R r)
if ((isConvertibleToString!R && !isStaticArray!R) ||
    (isInputRange!R && isSomeChar!(ElementEncodingType!R)))
{
    import std.traits : StringTypeOf;

    // This would be cleaner if we had a way to check whether a type
    // was a range without any implicit conversions.  Instead, a type counts
    // as bringing its own range API when it declares any of the three input
    // range primitives as a member.
    enum hasRangeMembers = __traits(hasMember, R, "empty") ||
                           __traits(hasMember, R, "front") ||
                           __traits(hasMember, R, "popFront");

    static if (isAutodecodableString!R && !hasRangeMembers)
    {
        // Random-access view over the code units of the underlying string.
        static struct ByCodeUnitImpl
        {
        @safe pure nothrow @nogc:

            // input range
            @property bool empty() const
            {
                return source.length == 0;
            }

            @property auto ref front() inout
            {
                return source[0];
            }

            void popFront()
            {
                source = source[1 .. $];
            }

            // forward range
            @property auto save()
            {
                return ByCodeUnitImpl(source.save);
            }

            // bidirectional range
            @property auto ref back() inout
            {
                return source[$ - 1];
            }

            void popBack()
            {
                source = source[0 .. $-1];
            }

            // random access, slicing and length
            auto ref opIndex(size_t index) inout
            {
                return source[index];
            }

            auto opSlice(size_t lower, size_t upper)
            {
                return ByCodeUnitImpl(source[lower .. upper]);
            }

            @property size_t length() const
            {
                return source.length;
            }

            alias opDollar = length;

            // The string that is currently contained within this range.
            StringTypeOf!R source;
        }

        static assert(isRandomAccessRange!ByCodeUnitImpl);

        return ByCodeUnitImpl(r);
    }
    else static if (!isInputRange!R ||
                    (is(R : const dchar[]) && !hasRangeMembers))
    {
        // Not a range by itself (or a dchar array in disguise): hand back
        // the string it converts to.
        return cast(StringTypeOf!R) r;
    }
    else
    {
        // byCodeUnit for ranges and dchar[] is a no-op
        return r;
    }
}
181254a7Smrg
///
@safe unittest
{
    import std.range.primitives;
    import std.traits : isAutodecodableString;

    // byCodeUnit gives random access to the individual code units.
    auto r = "Hello, World!".byCodeUnit();
    static assert(hasLength!(typeof(r)));
    static assert(hasSlicing!(typeof(r)));
    static assert(isRandomAccessRange!(typeof(r)));
    static assert(is(ElementType!(typeof(r)) == immutable char));

    // contrast with the range capabilities of standard strings (with or
    // without autodecoding enabled).
    auto s = "Hello, World!";
    // a plain string is a bidirectional range in either mode
    static assert(isBidirectionalRange!(typeof(s)));
    static if (isAutodecodableString!(typeof(s)))
    {
        // with autodecoding enabled, strings are non-random-access ranges of
        // dchar.
        static assert(is(ElementType!(typeof(s)) == dchar));
        static assert(!isRandomAccessRange!(typeof(s)));
        static assert(!hasSlicing!(typeof(s)));
        static assert(!hasLength!(typeof(s)));
    }
    else
    {
        // without autodecoding, strings are normal arrays.
        static assert(is(ElementType!(typeof(s)) == immutable char));
        static assert(isRandomAccessRange!(typeof(s)));
        static assert(hasSlicing!(typeof(s)));
        static assert(hasLength!(typeof(s)));
    }
}
181254a7Smrg
/// `byCodeUnit` does no Unicode decoding
@safe unittest
{
    // noël written as 'e' followed by a combining diaeresis: the code unit
    // at index 2 is the bare letter.
    string decomposed = "noe\u0308l";
    auto decomposedUnits = decomposed.byCodeUnit;
    assert(decomposedUnits[2] == 'e');
    assert(decomposedUnits[2] != 'ë');

    // noël written with the precomposed ë character. Because string is
    // UTF-8, the code unit at index 2 is just the first of a sequence that
    // encodes 'ë'
    string precomposed = "no\u00EBl";
    auto precomposedUnits = precomposed.byCodeUnit;
    assert(precomposedUnits[2] != 'ë');
}
181254a7Smrg
/// `byCodeUnit` exposes a `source` property when wrapping narrow strings.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : popFrontN;
    import std.traits : isAutodecodableString;

    // narrow string: what is left after dropping three code units is visible
    // through the range itself and, when wrapped, through `source`
    {
        auto units = "hello world".byCodeUnit;
        units.popFrontN(3);
        assert(units.save.equal("lo world"));
        static if (isAutodecodableString!string) // only enabled with autodecoding
        {
            string remainder = units.source;
            assert(remainder == "lo world");
        }
    }
    // source only exists if the range was wrapped
    {
        auto units = "hello world"d.byCodeUnit;
        static assert(!__traits(compiles, units.source));
    }
}
*b1e83836Smrg
181254a7Smrg@safe pure nothrow @nogc unittest // byCodeUnit: result types and range behaviour for strings, wrappers, ranges and enums
181254a7Smrg{
181254a7Smrg    import std.range;
181254a7Smrg    { // char string: two byCodeUnit passes still yield every code unit unchanged
181254a7Smrg        enum testStr = "������ hello ディラン";
181254a7Smrg        char[testStr.length] s;
181254a7Smrg        int i;
181254a7Smrg        foreach (c; testStr.byCodeUnit().byCodeUnit())
181254a7Smrg        {
181254a7Smrg            s[i++] = c;
181254a7Smrg        }
181254a7Smrg        assert(s == testStr);
181254a7Smrg    }
181254a7Smrg    { // same round trip for a wchar string
181254a7Smrg        enum testStr = "������ hello ディラン"w;
181254a7Smrg        wchar[testStr.length] s;
181254a7Smrg        int i;
181254a7Smrg        foreach (c; testStr.byCodeUnit().byCodeUnit())
181254a7Smrg        {
181254a7Smrg            s[i++] = c;
181254a7Smrg        }
181254a7Smrg        assert(s == testStr);
181254a7Smrg    }
181254a7Smrg    { // same round trip for a dchar string
181254a7Smrg        enum testStr = "������ hello ディラン"d;
181254a7Smrg        dchar[testStr.length] s;
181254a7Smrg        int i;
181254a7Smrg        foreach (c; testStr.byCodeUnit().byCodeUnit())
181254a7Smrg        {
181254a7Smrg            s[i++] = c;
181254a7Smrg        }
181254a7Smrg        assert(s == testStr);
181254a7Smrg    }
181254a7Smrg    { // length, indexing and slicing
181254a7Smrg        auto bcu = "hello".byCodeUnit();
181254a7Smrg        assert(bcu.length == 5);
181254a7Smrg        assert(bcu[3] == 'l');
181254a7Smrg        assert(bcu[2 .. 4][1] == 'l');
181254a7Smrg    }
181254a7Smrg    { // over a mutable char[], front and opIndex can be assigned through
181254a7Smrg        char[5] orig = "hello";
181254a7Smrg        auto bcu = orig[].byCodeUnit();
181254a7Smrg        bcu.front = 'H';
181254a7Smrg        assert(bcu.front == 'H');
181254a7Smrg        bcu[1] = 'E';
181254a7Smrg        assert(bcu[1] == 'E');
181254a7Smrg    }
181254a7Smrg    { // save: advancing the original leaves the saved copy at its old position
181254a7Smrg        auto bcu = "hello".byCodeUnit().byCodeUnit();
181254a7Smrg        static assert(isForwardRange!(typeof(bcu)));
*b1e83836Smrg        static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
181254a7Smrg        auto s = bcu.save;
181254a7Smrg        bcu.popFront();
181254a7Smrg        assert(s.front == 'h');
181254a7Smrg    }
181254a7Smrg    { // bidirectional: retro walks the char code units from the back
181254a7Smrg        auto bcu = "hello".byCodeUnit();
181254a7Smrg        static assert(hasSlicing!(typeof(bcu)));
181254a7Smrg        static assert(isBidirectionalRange!(typeof(bcu)));
*b1e83836Smrg        static assert(is(typeof(bcu) == struct) == isAutodecodableString!string);
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        auto ret = bcu.retro;
181254a7Smrg        assert(ret.front == 'o');
181254a7Smrg        ret.popFront();
181254a7Smrg        assert(ret.front == 'l');
181254a7Smrg    }
181254a7Smrg    { // bidirectional: same check over wchar code units
181254a7Smrg        auto bcu = "κόσμε"w.byCodeUnit();
181254a7Smrg        static assert(hasSlicing!(typeof(bcu)));
181254a7Smrg        static assert(isBidirectionalRange!(typeof(bcu)));
*b1e83836Smrg        static assert(is(typeof(bcu) == struct) == isAutodecodableString!wstring);
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        auto ret = bcu.retro;
181254a7Smrg        assert(ret.front == 'ε');
181254a7Smrg        ret.popFront();
181254a7Smrg        assert(ret.front == 'μ');
181254a7Smrg    }
181254a7Smrg    { // struct that is `alias this`-ed to a string field
181254a7Smrg        static struct Stringish
181254a7Smrg        {
181254a7Smrg            string s;
181254a7Smrg            alias s this;
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        auto orig = Stringish("\U0010fff8 �� foo ��");
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(is(typeof(bcu) == struct));
*b1e83836Smrg        static assert(!is(typeof(bcu) == Stringish) == isAutodecodableString!Stringish);
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == immutable char));
181254a7Smrg        assert(bcu.front == cast(char) 244);
181254a7Smrg    }
181254a7Smrg    { // struct that is `alias this`-ed to a wstring field
181254a7Smrg        static struct WStringish
181254a7Smrg        {
181254a7Smrg            wstring s;
181254a7Smrg            alias s this;
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        auto orig = WStringish("\U0010fff8 �� foo ��"w);
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(is(typeof(bcu) == struct));
*b1e83836Smrg        static assert(!is(typeof(bcu) == WStringish) == isAutodecodableString!WStringish);
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
181254a7Smrg        assert(bcu.front == cast(wchar) 56319);
181254a7Smrg    }
181254a7Smrg    { // struct that is `alias this`-ed to a dstring field: the result is a plain dstring
181254a7Smrg        static struct DStringish
181254a7Smrg        {
181254a7Smrg            dstring s;
181254a7Smrg            alias s this;
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        auto orig = DStringish("\U0010fff8 �� foo ��"d);
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(is(typeof(bcu) == dstring));
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
181254a7Smrg        assert(bcu.front == cast(dchar) 1114104);
181254a7Smrg    }
181254a7Smrg    { // `alias this` to a member function returning string: never returned as FuncStringish
181254a7Smrg        static struct FuncStringish
181254a7Smrg        {
181254a7Smrg            string str;
181254a7Smrg            string s() pure nothrow @nogc { return str; }
181254a7Smrg            alias s this;
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        auto orig = FuncStringish("\U0010fff8 �� foo ��");
181254a7Smrg        auto bcu = orig.byCodeUnit();
*b1e83836Smrg        static if (isAutodecodableString!FuncStringish)
181254a7Smrg            static assert(is(typeof(bcu) == struct));
*b1e83836Smrg        else
*b1e83836Smrg            static assert(is(typeof(bcu) == string));
181254a7Smrg        static assert(!is(typeof(bcu) == FuncStringish));
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == immutable char));
181254a7Smrg        assert(bcu.front == cast(char) 244);
181254a7Smrg    }
181254a7Smrg    { // user-defined input range of char: handed back with its type unchanged
181254a7Smrg        static struct Range
181254a7Smrg        {
181254a7Smrg            string data;
181254a7Smrg            bool empty() pure nothrow @nogc { return data.empty; }
181254a7Smrg            char front() pure nothrow @nogc { return data[0]; }
181254a7Smrg            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        auto orig = Range("\U0010fff8 �� foo ��");
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(is(typeof(bcu) == Range));
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == char));
181254a7Smrg        assert(bcu.front == cast(char) 244);
181254a7Smrg    }
181254a7Smrg    { // user-defined input range of wchar: handed back with its type unchanged
181254a7Smrg        static struct WRange
181254a7Smrg        {
181254a7Smrg            wstring data;
181254a7Smrg            bool empty() pure nothrow @nogc { return data.empty; }
181254a7Smrg            wchar front() pure nothrow @nogc { return data[0]; }
181254a7Smrg            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        auto orig = WRange("\U0010fff8 �� foo ��"w);
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(is(typeof(bcu) == WRange));
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == wchar));
181254a7Smrg        assert(bcu.front == 56319);
181254a7Smrg    }
181254a7Smrg    { // user-defined input range of dchar: handed back with its type unchanged
181254a7Smrg        static struct DRange
181254a7Smrg        {
181254a7Smrg            dstring data;
181254a7Smrg            bool empty() pure nothrow @nogc { return data.empty; }
181254a7Smrg            dchar front() pure nothrow @nogc { return data[0]; }
181254a7Smrg            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        auto orig = DRange("\U0010fff8 �� foo ��"d);
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(is(typeof(bcu) == DRange));
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == dchar));
181254a7Smrg        assert(bcu.front == 1114104);
181254a7Smrg    }
181254a7Smrg    { // both a range and `alias this` string: returned as-is, front comes from `data`, not `s`
181254a7Smrg        static struct RangeAndStringish
181254a7Smrg        {
181254a7Smrg            bool empty() pure nothrow @nogc { return data.empty; }
181254a7Smrg            char front() pure nothrow @nogc { return data[0]; }
181254a7Smrg            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
181254a7Smrg
181254a7Smrg            string data;
181254a7Smrg            string s;
181254a7Smrg            alias s this;
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        auto orig = RangeAndStringish("test.d", "other");
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(is(typeof(bcu) == RangeAndStringish));
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == char));
181254a7Smrg        assert(bcu.front == 't');
181254a7Smrg    }
181254a7Smrg    { // wchar variant of the range-and-stringish case
181254a7Smrg        static struct WRangeAndStringish
181254a7Smrg        {
181254a7Smrg            bool empty() pure nothrow @nogc { return data.empty; }
181254a7Smrg            wchar front() pure nothrow @nogc { return data[0]; }
181254a7Smrg            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
181254a7Smrg
181254a7Smrg            wstring data;
181254a7Smrg            wstring s;
181254a7Smrg            alias s this;
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        auto orig = WRangeAndStringish("test.d"w, "other"w);
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(is(typeof(bcu) == WRangeAndStringish));
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == wchar));
181254a7Smrg        assert(bcu.front == 't');
181254a7Smrg    }
181254a7Smrg    { // dchar variant of the range-and-stringish case
181254a7Smrg        static struct DRangeAndStringish
181254a7Smrg        {
181254a7Smrg            bool empty() pure nothrow @nogc { return data.empty; }
181254a7Smrg            dchar front() pure nothrow @nogc { return data[0]; }
181254a7Smrg            void popFront() pure nothrow @nogc { data = data[1 .. $]; }
181254a7Smrg
181254a7Smrg            dstring data;
181254a7Smrg            dstring s;
181254a7Smrg            alias s this;
181254a7Smrg        }
181254a7Smrg
181254a7Smrg        auto orig = DRangeAndStringish("test.d"d, "other"d);
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(is(typeof(bcu) == DRangeAndStringish));
181254a7Smrg        static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit())));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == dchar));
181254a7Smrg        assert(bcu.front == 't');
181254a7Smrg    }
181254a7Smrg    { // enum with a string base type: the result is never the enum type itself
181254a7Smrg        enum Enum : string { a = "test.d" }
181254a7Smrg
181254a7Smrg        auto orig = Enum.a;
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(!is(typeof(bcu) == Enum));
*b1e83836Smrg        static if (isAutodecodableString!Enum)
181254a7Smrg            static assert(is(typeof(bcu) == struct));
*b1e83836Smrg        else
*b1e83836Smrg            static assert(is(typeof(bcu) == string));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == immutable char));
181254a7Smrg        assert(bcu.front == 't');
181254a7Smrg    }
181254a7Smrg    { // enum with a wstring base type
181254a7Smrg        enum WEnum : wstring { a = "test.d"w }
181254a7Smrg
181254a7Smrg        auto orig = WEnum.a;
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(!is(typeof(bcu) == WEnum));
*b1e83836Smrg        static if (isAutodecodableString!WEnum)
181254a7Smrg            static assert(is(typeof(bcu) == struct));
*b1e83836Smrg        else
*b1e83836Smrg            static assert(is(typeof(bcu) == wstring));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == immutable wchar));
181254a7Smrg        assert(bcu.front == 't');
181254a7Smrg    }
181254a7Smrg    { // enum with a dstring base type: the result is a plain dstring
181254a7Smrg        enum DEnum : dstring { a = "test.d"d }
181254a7Smrg
181254a7Smrg        auto orig = DEnum.a;
181254a7Smrg        auto bcu = orig.byCodeUnit();
181254a7Smrg        static assert(is(typeof(bcu) == dstring));
181254a7Smrg        static assert(is(ElementType!(typeof(bcu)) == immutable dchar));
181254a7Smrg        assert(bcu.front == 't');
181254a7Smrg    }
181254a7Smrg
*b1e83836Smrg    static if (autodecodeStrings) // narrow string literals change type only when autodecoding is on
*b1e83836Smrg    {
181254a7Smrg        static assert(!is(typeof(byCodeUnit("hello")) == string));
181254a7Smrg        static assert(!is(typeof(byCodeUnit("hello"w)) == wstring));
*b1e83836Smrg    }
*b1e83836Smrg    else
*b1e83836Smrg    {
*b1e83836Smrg        static assert(is(typeof(byCodeUnit("hello")) == string));
*b1e83836Smrg        static assert(is(typeof(byCodeUnit("hello"w)) == wstring));
*b1e83836Smrg    }
181254a7Smrg    static assert(is(typeof(byCodeUnit("hello"d)) == dstring));
181254a7Smrg
181254a7Smrg    static assert(!__traits(compiles, byCodeUnit((char[5]).init))); // static arrays are rejected
181254a7Smrg    static assert(!__traits(compiles, byCodeUnit((wchar[5]).init)));
181254a7Smrg    static assert(!__traits(compiles, byCodeUnit((dchar[5]).init)));
181254a7Smrg
181254a7Smrg    enum SEnum : char[5] { a = "hello" }
181254a7Smrg    enum WSEnum : wchar[5] { a = "hello"w }
181254a7Smrg    enum DSEnum : dchar[5] { a = "hello"d }
181254a7Smrg
181254a7Smrg    static assert(!__traits(compiles, byCodeUnit(SEnum.a))); // ...and so are enums whose base type is a static array
181254a7Smrg    static assert(!__traits(compiles, byCodeUnit(WSEnum.a)));
181254a7Smrg    static assert(!__traits(compiles, byCodeUnit(DSEnum.a)));
181254a7Smrg}
181254a7Smrg
/****************************
 * Lazily walk an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * of characters as `char`, `wchar`, or `dchar` elements.
 *
 * Each alias is just $(LREF byUTF) instantiated with the matching `C`
 * argument; see $(LREF byUTF) for how the elements are converted.
 *
 * Params:
 *      r = input range of characters, or array of characters
 */
alias byChar = byUTF!(char);

/// Ditto
alias byWchar = byUTF!(wchar);

/// Ditto
alias byDchar = byUTF!(dchar);
181254a7Smrg
@safe pure nothrow @nogc unittest
{
    // char -> char: applying byChar twice still walks the original code units.
    {
        char[5] s;
        int i;
        foreach (c; "hello".byChar.byChar())
        {
            s[i++] = c;
        }
        assert(s == "hello");
    }
    // dchar -> char.  Expected UTF-8 length: "hello" (5) + U+07FF (2) +
    // U+D7FF (3) + U+10FFFF (4) + two invalid code points, each emitted as
    // U+FFFD (3 + 3).
    {
        char[5+2+3+4+3+3] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byChar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD");
    }
    // wchar -> char: front follows popFront.
    {
        auto r = "hello"w.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // dchar -> char: front follows popFront.
    {
        auto r = "hello"d.byChar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // The result is a forward range (a compile-time property, hence
    // `static assert`), and a saved copy is unaffected by advancing the
    // original.
    {
        auto r = "hello"d.byChar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
181254a7Smrg
@safe pure nothrow @nogc unittest
{
    // dchar -> wchar.  Expected UTF-16 length: "hello" (5) + U+07FF (1) +
    // U+D7FF (1) + U+10FFFF (surrogate pair, 2) + two invalid code points,
    // each emitted as U+FFFD (1 + 1).
    {
        wchar[11] s;
        int i;
        dchar[10] a;
        a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d;
        a[8] = 0xD800;   // invalid
        a[9] = cast(dchar) 0x110000; // invalid
        foreach (c; a[].byWchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w);
    }

    // char -> wchar: front follows popFront.
    {
        auto r = "hello".byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // dchar -> wchar: front follows popFront.
    {
        auto r = "hello"d.byWchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }
    // The result is a forward range (a compile-time property, hence
    // `static assert`), and a saved copy is unaffected by advancing the
    // original.
    {
        auto r = "hello"d.byWchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
181254a7Smrg
@safe pure nothrow @nogc unittest
{
    // char -> dchar over well-formed UTF-8.
    {
        dchar[9] s;
        int i;
        string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d);
    }
    // Each invalid char string is non-empty, decodes to the replacement
    // character, and yields the same value when front is read twice.
    {
        foreach (s; invalidUTFstrings!char())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    // char -> dchar: front follows popFront.
    {
        auto r = "hello".byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    // wchar -> dchar over well-formed UTF-16.
    {
        dchar[8] s;
        int i;
        wstring a = "hello\u07FF\uD7FF\U0010FFFF"w;
        foreach (c; a.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello\u07FF\uD7FF\U0010FFFF"d);
    }
    // Same checks as above for each invalid wchar string.
    {
        foreach (s; invalidUTFstrings!wchar())
        {
            auto r = s.byDchar();
            assert(!r.empty);
            assert(r.front == r.front);
            dchar c = r.front;
            assert(c == replacementDchar);
        }
    }
    // A valid surrogate pair combines into a single code point.
    {
        wchar[2] ws;
        ws[0] = 0xD800;
        ws[1] = 0xDD00;             // correct surrogate pair
        auto r = ws[].byDchar();
        assert(!r.empty);
        assert(r.front == r.front);
        dchar c = r.front;
        assert(c == '\U00010100');
    }
    // wchar -> dchar: front follows popFront.
    {
        auto r = "hello"w.byDchar();
        r.popFront();
        r.popFront();
        assert(r.front == 'l');
    }

    // dchar -> dchar: applying byDchar twice still yields the original code
    // points.
    {
        dchar[5] s;
        int i;
        dstring a = "hello"d;
        foreach (c; a.byDchar.byDchar())
        {
            s[i++] = c;
        }
        assert(s == "hello"d);
    }
    // The result is a forward range (a compile-time property, hence
    // `static assert`), and a saved copy is unaffected by advancing the
    // original: char source.
    {
        auto r = "hello".byDchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
    // Same forward-range check for a wchar source.
    {
        auto r = "hello"w.byDchar();
        static assert(isForwardRange!(typeof(r)));
        auto s = r.save;
        r.popFront();
        assert(s.front == 'h');
    }
}
181254a7Smrg
181254a7Smrg// test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar,
181254a7Smrg// which needs to support ranges with and without those attributes
181254a7Smrg
pure @safe nothrow @nogc unittest
{
    // This unittest carries all four attributes itself, so it only compiles
    // if iterating each adapter over a dchar[] slice is allowed under them.
    dchar[5] text = "hello"d;

    foreach (unit; text[].byChar())
    {
    }
    foreach (unit; text[].byWchar())
    {
    }
    foreach (unit; text[].byDchar())
    {
    }
}
181254a7Smrg
// Module-level counter that exists only in unittest builds; it is incremented
// by the popFront of the test range in the unittest that follows.
version (StdUnittest)
{
    private int impureVariable;
}
181254a7Smrg
@system unittest
{
    // An always-empty range whose popFront writes to a module-level variable
    // and throws.  Because `empty` is always true, popFront is never reached
    // when the loops below run; they only have to compile.
    static struct ImpureThrowingSystemRange(Char)
    {
        @property bool empty() const
        {
            return true;
        }

        @property Char front() const
        {
            return Char.init;
        }

        void popFront()
        {
            ++impureVariable;
            throw new Exception("only for testing nothrow");
        }
    }

    // Try every source code unit type against every adapter.
    foreach (CodeUnit; AliasSeq!(char, wchar, dchar))
    {
        ImpureThrowingSystemRange!CodeUnit src;

        foreach (unit; src.byChar())
        {
        }
        foreach (unit; src.byWchar())
        {
        }
        foreach (unit; src.byDchar())
        {
        }
    }
}
181254a7Smrg
181254a7Smrg/****************************
*b1e83836Smrg * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
*b1e83836Smrg * of characters by char type `C` by encoding the elements of the range.
181254a7Smrg *
*b1e83836Smrg * UTF sequences that cannot be converted to the specified encoding are either
181254a7Smrg * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution"
*b1e83836Smrg * of the Unicode Standard 6.2 or result in a thrown UTFException.
*b1e83836Smrg *  Hence byUTF is not symmetric.
181254a7Smrg * This algorithm is lazy, and does not allocate memory.
181254a7Smrg * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
181254a7Smrg * `r` parameter.
181254a7Smrg *
181254a7Smrg * Params:
181254a7Smrg *      C = `char`, `wchar`, or `dchar`
*b1e83836Smrg *      useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`,
*b1e83836Smrg *                            UseReplacementDchar.no means throw `UTFException` for invalid UTF
*b1e83836Smrg *
*b1e83836Smrg * Throws:
*b1e83836Smrg *      `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no`
*b1e83836Smrg *
*b1e83836Smrg * GC:
*b1e83836Smrg *      Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes`
181254a7Smrg *
181254a7Smrg * Returns:
*b1e83836Smrg *      A bidirectional range if `R` is a bidirectional range and not auto-decodable,
*b1e83836Smrg *      as defined by $(REF isAutodecodableString, std, traits).
*b1e83836Smrg *
*b1e83836Smrg *      A forward range if `R` is a forward range and not auto-decodable.
181254a7Smrg *
181254a7Smrg *      Or, if `R` is a range and it is auto-decodable and
181254a7Smrg *      `is(ElementEncodingType!typeof(r) == C)`, then the range is passed
181254a7Smrg *      to $(LREF byCodeUnit).
181254a7Smrg *
181254a7Smrg *      Otherwise, an input range of characters.
181254a7Smrg */
*b1e83836Smrgtemplate byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar)
181254a7Smrgif (isSomeChar!C)
181254a7Smrg{
*b1e83836Smrg    static if (is(immutable C == immutable UC, UC) && !is(C == UC))
*b1e83836Smrg        alias byUTF = byUTF!UC;
181254a7Smrg    else:
181254a7Smrg
181254a7Smrg    auto ref byUTF(R)(R r)
181254a7Smrg        if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
181254a7Smrg    {
181254a7Smrg        return byUTF(r.byCodeUnit());
181254a7Smrg    }
181254a7Smrg
181254a7Smrg    auto ref byUTF(R)(R r)
181254a7Smrg        if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R))
181254a7Smrg    {
*b1e83836Smrg        static if (is(immutable ElementEncodingType!R == immutable RC, RC) && is(RC == C))
181254a7Smrg        {
181254a7Smrg            return r.byCodeUnit();
181254a7Smrg        }
*b1e83836Smrg        else static if (is(C == dchar))
*b1e83836Smrg        {
*b1e83836Smrg            static struct Result
*b1e83836Smrg            {
*b1e83836Smrg                enum Empty = uint.max;  // range is empty or just constructed
*b1e83836Smrg
*b1e83836Smrg                this(return scope R r)
*b1e83836Smrg                {
*b1e83836Smrg                    this.r = r;
*b1e83836Smrg                }
*b1e83836Smrg
*b1e83836Smrg                this(return scope R r, uint buff)
*b1e83836Smrg                {
*b1e83836Smrg                    this.r = r;
*b1e83836Smrg                    this.buff = buff;
*b1e83836Smrg                }
*b1e83836Smrg
*b1e83836Smrg                static if (isBidirectionalRange!R)
*b1e83836Smrg                {
*b1e83836Smrg                    this(return scope R r, uint frontBuff, uint backBuff)
*b1e83836Smrg                    {
*b1e83836Smrg                        this.r = r;
*b1e83836Smrg                        this.buff = frontBuff;
*b1e83836Smrg                        this.backBuff = backBuff;
*b1e83836Smrg                    }
*b1e83836Smrg                }
*b1e83836Smrg
*b1e83836Smrg                @property bool empty()
*b1e83836Smrg                {
*b1e83836Smrg                    static if (isBidirectionalRange!R)
*b1e83836Smrg                        return buff == Empty && backBuff == Empty && r.empty;
*b1e83836Smrg                    else
*b1e83836Smrg                        return buff == Empty && r.empty;
*b1e83836Smrg                }
*b1e83836Smrg
*b1e83836Smrg                @property dchar front() scope // 'scope' required by call to decodeFront() below
*b1e83836Smrg                {
*b1e83836Smrg                    if (buff == Empty)
*b1e83836Smrg                    {
*b1e83836Smrg                        auto c = r.front;
*b1e83836Smrg
*b1e83836Smrg                        static if (is(RC == wchar))
*b1e83836Smrg                            enum firstMulti = 0xD800; // First high surrogate.
*b1e83836Smrg                        else
*b1e83836Smrg                            enum firstMulti = 0x80; // First non-ASCII.
*b1e83836Smrg                        if (c < firstMulti)
*b1e83836Smrg                        {
*b1e83836Smrg                            r.popFront;
*b1e83836Smrg                            buff = cast(dchar) c;
*b1e83836Smrg                        }
*b1e83836Smrg                        else
*b1e83836Smrg                        {
*b1e83836Smrg                            buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
*b1e83836Smrg                        }
*b1e83836Smrg                    }
*b1e83836Smrg                    return cast(dchar) buff;
*b1e83836Smrg                }
*b1e83836Smrg
*b1e83836Smrg                void popFront()
*b1e83836Smrg                {
                                 // With nothing cached the current code point has not been read
                                 // yet; front() is invoked purely for its side effect of
                                 // consuming that code point's units from `r`.
                                 const bool alreadyDecoded = buff != Empty;
                                 if (!alreadyDecoded)
*b1e83836Smrg                        front();
                                 // Clear the cache so the next front() decodes afresh.
*b1e83836Smrg                    buff = Empty;
*b1e83836Smrg                }
*b1e83836Smrg
*b1e83836Smrg                static if (isForwardRange!R)
*b1e83836Smrg                {
*b1e83836Smrg                    @property auto save()
*b1e83836Smrg                    {
                                     // The copy owns a saved source range and receives the cached
                                     // code point(s) by value, so both ranges continue from the
                                     // same logical position.
                                     static if (!isBidirectionalRange!R)
                                         return Result(r.save, buff);
                                     else
                                         return Result(r.save, buff, backBuff);
*b1e83836Smrg                    }
*b1e83836Smrg                }
*b1e83836Smrg
*b1e83836Smrg                static if (isBidirectionalRange!R)
*b1e83836Smrg                {
*b1e83836Smrg                    @property dchar back() scope // 'scope' required by call to decodeBack() below
*b1e83836Smrg                    {
                                     // Returns the last code point, decoding it on first access and
                                     // keeping it in `backBuff` until popBack() resets the cache.
                                     // A cache miss already shortens `r` from the back.
*b1e83836Smrg                        if (backBuff != Empty)
*b1e83836Smrg                            return cast(dchar) backBuff;
*b1e83836Smrg
*b1e83836Smrg                        auto c = r.back;
                                     // Code units below `firstMulti` are taken as complete code
                                     // points and bypass the decoder.
*b1e83836Smrg                        static if (is(RC == wchar))
*b1e83836Smrg                            enum firstMulti = 0xD800; // First high surrogate.
*b1e83836Smrg                        else
*b1e83836Smrg                            enum firstMulti = 0x80; // First non-ASCII.
*b1e83836Smrg                        if (c < firstMulti)
*b1e83836Smrg                        {
*b1e83836Smrg                            r.popBack;
*b1e83836Smrg                            backBuff = cast(dchar) c;
*b1e83836Smrg                        }
*b1e83836Smrg                        else
*b1e83836Smrg                        {
                                         // NOTE(review): decodeBack is defined outside this view; it
                                         // presumably removes the whole sequence from `r` - confirm.
*b1e83836Smrg                            backBuff = () @trusted { return decodeBack!useReplacementDchar(r); }();
*b1e83836Smrg                        }
*b1e83836Smrg                        return cast(dchar) backBuff;
*b1e83836Smrg
*b1e83836Smrg                    }
*b1e83836Smrg
*b1e83836Smrg                    void popBack()
*b1e83836Smrg                    {
                                     // With nothing cached the last code point has not been read
                                     // yet; back() is invoked purely for its side effect of
                                     // consuming that code point's units from the end of `r`.
                                     const bool alreadyDecoded = backBuff != Empty;
                                     if (!alreadyDecoded)
*b1e83836Smrg                            back();
                                     // Clear the cache so the next back() decodes afresh.
*b1e83836Smrg                        backBuff = Empty;
*b1e83836Smrg                    }
*b1e83836Smrg                }
*b1e83836Smrg
*b1e83836Smrg            private:
*b1e83836Smrg
                             // Source range whose code units front()/back() decode.
*b1e83836Smrg                R r;
*b1e83836Smrg                uint buff = Empty;      // one character lookahead buffer
                             // Matching one-slot cache for back(); declared only when R is
                             // bidirectional. `Empty` marks either cache as holding nothing.
*b1e83836Smrg                static if (isBidirectionalRange!R)
*b1e83836Smrg                    uint backBuff = Empty;
*b1e83836Smrg            }
*b1e83836Smrg
*b1e83836Smrg            return Result(r);
*b1e83836Smrg        }
181254a7Smrg        else
181254a7Smrg        {
181254a7Smrg            static struct Result
181254a7Smrg            {
                            // Range that re-encodes the code units of `r` as code units of type
                            // `C`. One code point at a time is decoded and its encoding is
                            // cached: `buf[pos .. fill]` holds the units still to be served by
                            // front(), and `backBuf[0 .. backFill - backPos]` holds the units
                            // still to be served by back().
                            //
                            // The two ends use separate buffers. Sharing one buffer would let a
                            // back() call overwrite units that front() has cached but not yet
                            // handed out (and vice versa).

*b1e83836Smrg                this(return scope R r)
*b1e83836Smrg                {
*b1e83836Smrg                    this.r = r;
*b1e83836Smrg                }
*b1e83836Smrg
                             // Rebuilds a range from previously captured front-side state;
                             // used by save() when R is not bidirectional.
*b1e83836Smrg                this(return scope R r, ushort pos, ushort fill, C[4 / C.sizeof] buf)
*b1e83836Smrg                {
*b1e83836Smrg                    this.r = r;
*b1e83836Smrg                    this.pos = pos;
*b1e83836Smrg                    this.fill = fill;
*b1e83836Smrg                    this.buf = buf;
*b1e83836Smrg                }
*b1e83836Smrg
*b1e83836Smrg                static if (isBidirectionalRange!R)
*b1e83836Smrg                {
                                 // Rebuilds a range from captured front- and back-side state;
                                 // used by save(). `backBuf` is defaulted so that six-argument
                                 // calls keep compiling.
*b1e83836Smrg                    this(return scope R r, ushort frontPos, ushort frontFill,
                                      ushort backPos, ushort backFill, C[4 / C.sizeof] buf,
                                      C[4 / C.sizeof] backBuf = (C[4 / C.sizeof]).init)
*b1e83836Smrg                    {
*b1e83836Smrg                        this.r = r;
*b1e83836Smrg                        this.pos = frontPos;
*b1e83836Smrg                        this.fill = frontFill;
*b1e83836Smrg                        this.backPos = backPos;
*b1e83836Smrg                        this.backFill = backFill;
*b1e83836Smrg                        this.buf = buf;
                                     this.backBuf = backBuf;
*b1e83836Smrg                    }
*b1e83836Smrg                }
*b1e83836Smrg
                            // Empty only when both caches are drained and the source is exhausted.
181254a7Smrg                @property bool empty()
181254a7Smrg                {
*b1e83836Smrg                    static if (isBidirectionalRange!R)
*b1e83836Smrg                        return pos == fill && backPos == backFill && r.empty;
*b1e83836Smrg                    else
181254a7Smrg                        return pos == fill && r.empty;
181254a7Smrg                }
181254a7Smrg
181254a7Smrg                @property auto front() scope // 'scope' required by call to decodeFront() below
181254a7Smrg                {
                            // Refill the front cache when every cached unit has been served.
181254a7Smrg                    if (pos == fill)
181254a7Smrg                    {
181254a7Smrg                        pos = 0;
181254a7Smrg                        auto c = r.front;
181254a7Smrg
                             // Units below `firstMulti` are copied through unchanged as a
                             // single output unit, without decoding.
*b1e83836Smrg                        static if (C.sizeof >= 2 && RC.sizeof >= 2)
*b1e83836Smrg                            enum firstMulti = 0xD800; // First high surrogate.
*b1e83836Smrg                        else
*b1e83836Smrg                            enum firstMulti = 0x80; // First non-ASCII.
*b1e83836Smrg                        if (c < firstMulti)
181254a7Smrg                        {
181254a7Smrg                            fill = 1;
181254a7Smrg                            r.popFront;
181254a7Smrg                            buf[pos] = cast(C) c;
181254a7Smrg                        }
181254a7Smrg                        else
181254a7Smrg                        {
                            // A dchar source needs no decoding: each unit is a code point.
181254a7Smrg                            static if (is(RC == dchar))
181254a7Smrg                            {
181254a7Smrg                                r.popFront;
181254a7Smrg                                dchar dc = c;
181254a7Smrg                            }
181254a7Smrg                            else
*b1e83836Smrg                                dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }();
*b1e83836Smrg                            fill = cast(ushort) encode!(useReplacementDchar)(buf, dc);
181254a7Smrg                        }
181254a7Smrg                    }
181254a7Smrg                    return buf[pos];
181254a7Smrg                }
181254a7Smrg
181254a7Smrg                void popFront()
181254a7Smrg                {
                            // front is evaluated only to fill the cache before stepping.
181254a7Smrg                    if (pos == fill)
181254a7Smrg                        front;
181254a7Smrg                    ++pos;
181254a7Smrg                }
181254a7Smrg
181254a7Smrg                static if (isForwardRange!R)
181254a7Smrg                {
*b1e83836Smrg                    @property auto save()
181254a7Smrg                    {
*b1e83836Smrg                        static if (isBidirectionalRange!R)
*b1e83836Smrg                        {
                                         // Both caches are copied so the saved range resumes
                                         // mid-code-point at either end.
                                         return Result(r.save, pos, fill, backPos, backFill, buf, backBuf);
*b1e83836Smrg                        }
*b1e83836Smrg                        else
*b1e83836Smrg                        {
*b1e83836Smrg                            return Result(r.save, pos, fill, buf);
*b1e83836Smrg                        }
*b1e83836Smrg                    }
*b1e83836Smrg                }
*b1e83836Smrg
*b1e83836Smrg                static if (isBidirectionalRange!R)
*b1e83836Smrg                {
*b1e83836Smrg                    @property auto back() scope // 'scope' required by call to decodeBack() below
*b1e83836Smrg                    {
                                     // Units are stored in forward order, so the current back
                                     // unit sits at index `backFill - backPos - 1`.
*b1e83836Smrg                        if (backPos != backFill)
                                         return backBuf[cast(ushort) (backFill - backPos - 1)];
*b1e83836Smrg
*b1e83836Smrg                        backPos = 0;
*b1e83836Smrg                        auto c = r.back;
*b1e83836Smrg                        static if (C.sizeof >= 2 && RC.sizeof >= 2)
*b1e83836Smrg                            enum firstMulti = 0xD800; // First high surrogate.
*b1e83836Smrg                        else
*b1e83836Smrg                            enum firstMulti = 0x80; // First non-ASCII.
*b1e83836Smrg                        if (c < firstMulti)
*b1e83836Smrg                        {
*b1e83836Smrg                            backFill = 1;
*b1e83836Smrg                            r.popBack;
                                         backBuf[cast(ushort) (backFill - backPos - 1)] = cast(C) c;
*b1e83836Smrg                        }
*b1e83836Smrg                        else
*b1e83836Smrg                        {
*b1e83836Smrg                            static if (is(RC == dchar))
*b1e83836Smrg                            {
*b1e83836Smrg                                r.popBack;
*b1e83836Smrg                                dchar dc = c;
*b1e83836Smrg                            }
*b1e83836Smrg                            else
*b1e83836Smrg                                dchar dc = () @trusted { return decodeBack!(useReplacementDchar)(r); }();
                                         // Encode into the back-side buffer only; `buf` may still
                                         // hold units that front() has not served yet.
                                         backFill = cast(ushort) encode!(useReplacementDchar)(backBuf, dc);
*b1e83836Smrg                        }
                                     return backBuf[cast(ushort) (backFill - backPos - 1)];
*b1e83836Smrg                    }
*b1e83836Smrg
*b1e83836Smrg                    void popBack()
*b1e83836Smrg                    {
                                     // back is evaluated only to fill the cache before stepping.
*b1e83836Smrg                        if (backPos == backFill)
*b1e83836Smrg                            back;
*b1e83836Smrg                        ++backPos;
181254a7Smrg                    }
181254a7Smrg                }
181254a7Smrg
181254a7Smrg            private:
181254a7Smrg
181254a7Smrg                R r;
                            // Front cache cursor and length: buf[pos .. fill] is pending.
181254a7Smrg                ushort pos, fill;
                             // Back cache: number of units already served, and units encoded.
*b1e83836Smrg                static if (isBidirectionalRange!R)
*b1e83836Smrg                    ushort backPos, backFill;
*b1e83836Smrg                C[4 / C.sizeof] buf = void;
                             // Dedicated storage for the back cache (written before any read).
                             static if (isBidirectionalRange!R)
                                 C[4 / C.sizeof] backBuf = void;
181254a7Smrg            }
181254a7Smrg
181254a7Smrg            return Result(r);
181254a7Smrg        }
181254a7Smrg    }
181254a7Smrg}
181254a7Smrg
181254a7Smrg///
181254a7Smrg@safe pure nothrow unittest
181254a7Smrg{
181254a7Smrg    import std.algorithm.comparison : equal;
181254a7Smrg
181254a7Smrg    // hellö as a range of `char`s, which are UTF-8
*b1e83836Smrg    assert("hell\u00F6".byUTF!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6]));
181254a7Smrg
181254a7Smrg    // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
*b1e83836Smrg    assert("hell\u00F6".byUTF!wchar().equal(['h', 'e', 'l', 'l', 'ö']));
181254a7Smrg
                // U+10437 is four code units in UTF-8, two in UTF-16, and one in UTF-32.
                // It is written as an escape so the literal survives re-encoding of this file.
                 assert("\U00010437".byUTF!char().equal([0xF0, 0x90, 0x90, 0xB7]));
                 assert("\U00010437".byUTF!wchar().equal([0xD801, 0xDC37]));
                 assert("\U00010437".byUTF!dchar().equal([0x00010437]));
*b1e83836Smrg}
*b1e83836Smrg
*b1e83836Smrg///
*b1e83836Smrg@safe unittest
*b1e83836Smrg{
*b1e83836Smrg    import std.algorithm.comparison : equal;
*b1e83836Smrg    import std.exception : assertThrown;
*b1e83836Smrg
                 // 0xF0 opens a four-byte UTF-8 sequence, but only ASCII follows it.
                 string malformed = "hello\xF0betty";

                 // Lenient mode: the malformed sequence comes out as a single U+FFFD.
                 auto lenient = malformed.byChar.byUTF!(dchar, UseReplacementDchar.yes);
                 assert(lenient.equal("hello\uFFFDetty"));

                 // Strict mode: iterating over the malformed sequence throws.
                 auto strict = malformed.byChar.byUTF!(dchar, UseReplacementDchar.no);
                 assertThrown!UTFException(strict.equal("hello betty"));
*b1e83836Smrg}
*b1e83836Smrg
*b1e83836Smrg@safe unittest
*b1e83836Smrg{
                 // Backward iteration (back/popBack) and save() on byUTF results.

                 // UTF-16 -> UTF-8: U+0219 is C8 99, served last byte first.
*b1e83836Smrg    {
*b1e83836Smrg        wchar[] s = ['a', 'b', 0x219];
*b1e83836Smrg        auto r = s.byUTF!char;
                     static assert(isBidirectionalRange!(typeof(r)));
*b1e83836Smrg        assert(r.back == 0x99);
*b1e83836Smrg        r.popBack;
*b1e83836Smrg        assert(r.back == 0xc8);
*b1e83836Smrg        r.popBack;
*b1e83836Smrg        assert(r.back == 'b');
*b1e83836Smrg
*b1e83836Smrg    }
*b1e83836Smrg
                 // UTF-16 -> UTF-16.
*b1e83836Smrg    {
*b1e83836Smrg        wchar[] s = ['a', 'b', 0x219];
*b1e83836Smrg        auto r = s.byUTF!wchar;
                     static assert(isBidirectionalRange!(typeof(r)));
*b1e83836Smrg        assert(r.back == 0x219);
*b1e83836Smrg        r.popBack;
*b1e83836Smrg        assert(r.back == 'b');
*b1e83836Smrg    }
*b1e83836Smrg
                 // UTF-16 -> UTF-32.
*b1e83836Smrg    {
*b1e83836Smrg        wchar[] s = ['a', 'b', 0x219];
*b1e83836Smrg        auto r = s.byUTF!dchar;
                     static assert(isBidirectionalRange!(typeof(r)));
*b1e83836Smrg        assert(r.back == 0x219);
*b1e83836Smrg        r.popBack;
*b1e83836Smrg        assert(r.back == 'b');
*b1e83836Smrg    }
*b1e83836Smrg
                 // UTF-32 -> UTF-16: each supplementary-plane code point becomes a
                 // surrogate pair, served low surrogate first from the back.
*b1e83836Smrg    {
                     dchar[] s = ['\U00010437', '\U0001F601'];
*b1e83836Smrg        auto r = s.byUTF!wchar;
*b1e83836Smrg        assert(r.back == 0xde01);
*b1e83836Smrg        r.popBack;
*b1e83836Smrg        assert(r.back == 0xd83d);
*b1e83836Smrg        r.popBack;
*b1e83836Smrg        assert(r.back == 0xdc37);
*b1e83836Smrg        r.popBack;
*b1e83836Smrg        assert(r.back == 0xd801);
*b1e83836Smrg    }
*b1e83836Smrg
                 // UTF-32 -> UTF-8: draining from the back yields the byte-reversed encoding.
*b1e83836Smrg    {
                     dchar[] s = ['\U00010437', '\U0001F601'];
*b1e83836Smrg        auto r = s.byUTF!char;
*b1e83836Smrg        char[] res;
*b1e83836Smrg        while (!r.empty)
*b1e83836Smrg        {
*b1e83836Smrg            res ~= r.back;
*b1e83836Smrg            r.popBack;
*b1e83836Smrg        }
*b1e83836Smrg        import std.algorithm.comparison : equal;
*b1e83836Smrg        assert(res.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0]));
*b1e83836Smrg    }
*b1e83836Smrg
                 // Plain ASCII drained from the back comes out reversed.
*b1e83836Smrg    {
*b1e83836Smrg        dchar[] res;
*b1e83836Smrg        auto r = ['a', 'b', 'c', 'd', 'e'].byUTF!dchar;
*b1e83836Smrg        while (!r.empty)
*b1e83836Smrg        {
*b1e83836Smrg            res ~= r.back;
*b1e83836Smrg            r.popBack;
*b1e83836Smrg        }
*b1e83836Smrg        import std.algorithm.comparison : equal;
*b1e83836Smrg        assert(res.equal(['e', 'd', 'c', 'b', 'a']));
*b1e83836Smrg    }
*b1e83836Smrg
*b1e83836Smrg    {
*b1e83836Smrg        //testing the save() function
*b1e83836Smrg        wchar[] s = ['Ă','ț'];
*b1e83836Smrg
*b1e83836Smrg        auto rc = s.byUTF!char;
*b1e83836Smrg        rc.popBack;
*b1e83836Smrg        auto rcCopy = rc.save;
*b1e83836Smrg        assert(rc.back == rcCopy.back);
*b1e83836Smrg        assert(rcCopy.back == 0xc8);
*b1e83836Smrg
*b1e83836Smrg        auto rd = s.byUTF!dchar;
*b1e83836Smrg        rd.popBack;
*b1e83836Smrg        auto rdCopy = rd.save;
*b1e83836Smrg        assert(rd.back == rdCopy.back);
*b1e83836Smrg        assert(rdCopy.back == 'Ă');
*b1e83836Smrg    }
*b1e83836Smrg}
*b1e83836Smrg
*b1e83836Smrg///
*b1e83836Smrg@safe pure nothrow unittest
*b1e83836Smrg{
*b1e83836Smrg    import std.range.primitives;
                 wchar[] text = ['ă', 'î'];

                 // As UTF-8: 'î' is C3 AE and 'ă' is C4 83, so walking from the back
                 // yields AE, C3, 83 and finally C4.
                 auto chars = text.byUTF!char;
                 static assert(isBidirectionalRange!(typeof(chars)));
                 foreach (expected; [0xae, 0xc3, 0x83])
                 {
                     assert(chars.back == expected);
                     chars.popBack;
                 }
                 assert(chars.back == 0xc4);

                 // As UTF-16: one code unit per character.
                 auto wchars = text.byUTF!wchar;
                 static assert(isBidirectionalRange!(typeof(wchars)));
                 assert(wchars.back == 'î');
                 wchars.popBack;
                 assert(wchars.back == 'ă');

                 // As UTF-32: one code point per character.
                 auto dchars = text.byUTF!dchar;
                 static assert(isBidirectionalRange!(typeof(dchars)));
                 assert(dchars.back == 'î');
                 dchars.popBack;
                 assert(dchars.back == 'ă');
181254a7Smrg}