utf.d - OpenGrok cross reference for /netbsd-src/external/gpl3/gcc.old/dist/libphobos/src/std/utf.d

Lines Matching +full:no +full:- +full:cast
4     Encode and decode UTF-8, UTF-16 and UTF-32 strings.
53         $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
55     Copyright: Copyright Digital Mars 2000 - 2012.
66 import std.typecons;   // Flag, Yes, No
112              * it is const-compatible.  in toString()
115             auto e = () @trusted { return cast(Exception) super; } ();  in toString()
134             result ~= " - ";  in toString()
172               cast(char)(0xF0 | (y >> 18)),
173               cast(char)(0x80 | ((y >> 12) & 0x3F)),
174               cast(char)(0x80 | ((y >> 6) & 0x3F)),
175               cast(char)(0x80 | (y & 0x3F))
178               cast(char)(0xF8 | 3),     // 5 byte encoding
179               cast(char)(0x80 | 3),
180               cast(char)(0x80 | 3),
181               cast(char)(0x80 | 3),
182               cast(char)(0x80 | 3),
185               cast(char)(0xFC | 3),     // 6 byte encoding
186               cast(char)(0x80 | 3),
187               cast(char)(0x80 | 3),
188               cast(char)(0x80 | 3),
189               cast(char)(0x80 | 3),
190               cast(char)(0x80 | 3),
201               cast(wchar) 0xDC00,
204               cast(wchar) 0xDFFF,
207               cast(wchar) 0xDBFF,
208               cast(wchar) 0xDBFF,
211               cast(wchar) 0xDBFF,
212               cast(wchar) 0xE000,
215               cast(wchar) 0xD800,
225             [ cast(dchar) 0x110000 ],
226             [ cast(dchar) 0x00D800 ],
227             [ cast(dchar) 0x00DFFF ],
261     assert( isValidDchar(cast(dchar)'a') == true);
262     assert( isValidDchar(cast(dchar) 0x1FFFFF) == false);
264     assert(!isValidDchar(cast(dchar) 0x00D800));
265     assert(!isValidDchar(cast(dchar) 0x00DBFF));
266     assert(!isValidDchar(cast(dchar) 0x00DC00));
267     assert(!isValidDchar(cast(dchar) 0x00DFFF));
268     assert( isValidDchar(cast(dchar) 0x00FFFE));
269     assert( isValidDchar(cast(dchar) 0x00FFFF));
270     assert( isValidDchar(cast(dchar) 0x01FFFF));
271     assert( isValidDchar(cast(dchar) 0x10FFFF));
272     assert(!isValidDchar(cast(dchar) 0x110000));
287         The number of code units in the UTF sequence. For UTF-8, this is a
288 …value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) secti…
289         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
306         assert(index < str.length, "Past the end of the UTF-8 sequence");
336     immutable msbs = 7 - bsr((~uint(c)) & 0xFF);
338         throw new UTFException("Invalid UTF-8 sequence", index);
429         assert(index < str.length, "Past the end of the UTF-16 sequence");
445     assert(!str.empty, "UTF-16 sequence is empty");
523         assert(index < str.length, "Past the end of the UTF-32 sequence");
525         assert(!str.empty, "UTF-32 sequence is empty.");
606         The number of code units in the UTF sequence. For UTF-8, this is a
607 …value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) secti…
608         For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
615         $(D strideBack) will only analyze the element at $(D str[index - 1])
625         assert(index <= str.length, "Past the end of the UTF-8 sequence");
626     assert(index > 0, "Not the end of the UTF-8 sequence");
628     if ((str[index-1] & 0b1100_0000) != 0b1000_0000)
635             if ((str[index-i] & 0b1100_0000) != 0b1000_0000)
643             if (index >= i && (str[index-i] & 0b1100_0000) != 0b1000_0000)
662     assert(!str.empty, "Past the end of the UTF-8 sequence");
672     throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
742 //UTF-16 is self synchronizing: The length of strideBack can be found from
750         assert(index <= str.length, "Past the end of the UTF-16 sequence");
751     assert(index > 0, "Not the end of a UTF-16 sequence");
753     immutable c2 = str[index-1];
762     assert(!str.empty, "UTF-16 sequence is empty");
765         immutable c2 = str[$ - 1];
844         assert(index <= str.length, "Past the end of the UTF-32 sequence");
845     assert(index > 0, "Not the end of the UTF-32 sequence");
853     assert(!str.empty, "Empty UTF-32 sequence");
948                 throw new UTFException("Invalid UTF-8 sequence", index);
950                 throw new UTFException("Invalid UTF-16 sequence", index);
989         while (n--)
1022     well-formed, then a $(D UTFException) is thrown and $(D index) remains
1039         sequence and useReplacementDchar is $(D No.useReplacementDchar)
1041 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, r…
1060 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1097         sequence. If an exception is thrown, then there is no guarantee as to
1102 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1139 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1168 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
1194         but there is no guarantee as to the value of $(D numCodeUnits) (when passed).
1196 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1209     if (str[$ - 1] < codeUnitLimit!S)
1212         immutable retval = str[$ - 1];
1213         str = str[0 .. $ - 1];
1219         immutable newLength = str.length - numCodeUnits;
1228 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1254             size_t index = str.length - numCodeUnits;
1266                 codeUnits[--i] = tmp.back;
1279 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
1326 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacement…
1341     /* Dchar bitmask for different numbers of UTF-8 code units.
1343     alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);
1357         immutable length = str.length - index;
1387                return exception(pstr[0 .. length], "Invalid UTF-8 sequence");  in invalidUTF()
1394                 //error message for the (hopefully) rare case when an invalid UTF-8  in invalidUTF()
1397                return new UTFException("Invalid UTF-8 sequence");  in invalidUTF()
1476         if (!(fst & 0x80)) // no more bytes
1481             if ((d & ~bitMask[i - 1]) == 0)
1557 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacement…
1573         immutable length = str.length - index;
1611                 throw exception("surrogate UTF-16 high value past end of string");
1627                 throw exception("surrogate UTF-16 low value out of range");
1630             u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
1638             throw exception("unpaired surrogate UTF-16 value");
1645     return cast(dchar) u;
1674 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacement…
1691                 throw new UTFException("Invalid UTF-32 value").setSequence(dc);
1704                 throw new UTFException("Invalid UTF-32 value").setSequence(dc);
1788         enforce(range.length == lenBefore - numCodeUnits,  in version()
1818             enforce(range.length == lenBefore - numCodeUnits,  in version()
1953         //Invalid UTF-8 sequence where the first code unit is valid.
1954         testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
1955         testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);
1957         //Invalid UTF-8 sequence where the first code unit isn't valid.
1983         testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
1984         testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
1985         testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
1986         testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
1987         testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);
1989         testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
1990         testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);
1992         testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
1993         testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));
2016         auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
2017                       cast(wchar) 0x1400,
2018                       cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
2019         testDecode(str, 0, cast(dchar) 0x10000, 2);
2020         testDecode(str, 2, cast(dchar) 0x1400, 3);
2021         testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
2022         testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
2023         testDecodeBack(str, cast(dchar) 0x1400, 1);
2024         testDecodeBack(str, cast(dchar) 0x10000, 2);
2039         testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
2040         testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
2041         testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
2042         testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
2043         testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);
2045         testBadDecode(S([cast(dchar) 0xD800]), 0);
2046         testBadDecode(S([cast(dchar) 0xDFFE]), 0);
2047         testBadDecode(S([cast(dchar) 0x110000]), 0);
2049         testBadDecodeBack(S([cast(dchar) 0xD800]));
2050         testBadDecodeBack(S([cast(dchar) 0xDFFE]));
2051         testBadDecodeBack(S([cast(dchar) 0x110000]));
2074         auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
2078         testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
2079         testDecodeBack(str, cast(dchar) 0x1400, 1);
2080         testDecodeBack(str, cast(dchar) 0x10000, 1);
2140 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2146         buf[0] = cast(char) c;
2152         buf[0] = cast(char)(0xC0 | (c >> 6));
2153         buf[1] = cast(char)(0x80 | (c & 0x3F));
2159             c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);
2163         buf[0] = cast(char)(0xE0 | (c >> 12));
2164         buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
2165         buf[2] = cast(char)(0x80 | (c & 0x3F));
2171         buf[0] = cast(char)(0xF0 | (c >> 18));
2172         buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
2173         buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
2174         buf[3] = cast(char)(0x80 | (c & 0x3F));
2179     c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
2202     assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2203     assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2204     assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2205     assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2206     assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2208     assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2215 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2221 …   c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);
2225         buf[0] = cast(wchar) c;
2231         buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
2232         buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
2236     c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
2255     assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2256     assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2257     assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2258     assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2259     assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2261     assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2268 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2272         c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
2293     assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2294     assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2295     assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2296     assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2297     assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2299     assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2311 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2319         r ~= cast(char) c;
2329             buf[0] = cast(char)(0xC0 | (c >> 6));
2330             buf[1] = cast(char)(0x80 | (c & 0x3F));
2336 …              c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);
2340             buf[0] = cast(char)(0xE0 | (c >> 12));
2341             buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
2342             buf[2] = cast(char)(0x80 | (c & 0x3F));
2348             buf[0] = cast(char)(0xF0 | (c >> 18));
2349             buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
2350             buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
2351             buf[3] = cast(char)(0x80 | (c & 0x3F));
2357             c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
2372     encode(s, cast(dchar)'a');
2376     encode(s, cast(dchar)'\u00A9');
2381     encode(s, cast(dchar)'\u2260');
2406     assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2407     assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2408     assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2409     assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2410     assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2413     encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2419 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2427 …   c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);
2431         r ~= cast(wchar) c;
2438         buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
2439         buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
2445         c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
2467     assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2468     assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2469     assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2470     assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2471     assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2474     encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2480 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2484         c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
2504     assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2505     assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2506     assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2507     assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2508     assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2511     encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2661     assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
2667     assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
2668     assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
2677     Checks to see if $(D str) is well-formed unicode or not.
2680         $(D UTFException) if $(D str) is not well-formed.
2703  * Encodes the elements of `s` to UTF-8 and returns a newly allocated
2709  *     A UTF-8 string
2711  *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
2724     // The ö is represented by two UTF-8 code units
2727     // 𐐷 is four code units in UTF-8
2744  * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
2750  *     A UTF-16 string
2752  *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
2765     // these graphemes are two code units in UTF-16 and one in UTF-32
2787  * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
2793  *     A UTF-32 string
2795  *     For a lazy, non-allocating version of these functions, see $(LREF byUTF).
2827     Returns a C-style zero-terminated string equivalent to $(D str). $(D str)  in toUTFImpl()
2835     to allocate a new string - particularly when dealing with character types  in toUTFImpl()
2841     zero-terminated anymore. The most likely scenarios for that are if you  in toUTFImpl()
2842     append to $(D str) and no reallocation takes place or when $(D str) is a  in toUTFImpl()
2847     user-defined type with one declared right after the other) and that  in toUTFImpl()
2849     occur if you immediately use the zero-terminated string after calling  in toUTFImpl()
2851     Also, they are unlikely to occur even if you save the zero-terminated string  in toUTFImpl()
2853     However, if you save the zero-terminated string and want to be absolutely  in toUTFImpl()
2854     certain that the string stays zero-terminated, then simply append a  in toUTFImpl()
2886 //immutable(C)[] -> C*, const(C)*, or immutable(C)*
2901         return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
2910             // Peek past end of str, if it's 0, no conversion necessary.
2919             if ((cast(size_t) p & 3) && *p == '\0')
2923         return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
2931 //C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
2936     //const(C)[] -> const(C)* or
2937     //C[] -> C* or const(C)*
2946             if ((cast(size_t) p & 3) && *p == '\0')
2953     //const(C)[] -> C* or immutable(C)* or
2954     //C[] -> immutable(C)*
2959         copy[0 .. $ - 1] = str[];
2960         copy[$ - 1] = '\0';
2962         auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; }  in trustedCast()
2970 //C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
2979     return () @trusted { return cast(P) retval.data.ptr; } ();
2998         temp[0 .. $ - 1] = s1[0 .. $];
2999         temp[$ - 1] = '\n';
3000         --temp.length;
3073     Encodes string $(D s) into UTF-16 and returns the encoded string.
3130     Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
3133         $(D UTFException) if $(D str) is not well-formed.
3178         @property C back() { return _str[$ - 1]; }  in version()
3179         void popBack() { _str = _str[0 .. $ - 1]; }  in version()
3197         @property C back() { return _str[$ - 1]; }  in version()
3198         void popBack() { _str = _str[0 .. $ - 1]; }  in version()
3218         @property C back() { return _str[$ - 1]; }  in version()
3219         void popBack() { _str = _str[0 .. $ - 1]; }  in version()
3237         @property C back() { return _str[$ - 1]; }  in version()
3238         void popBack() { _str = _str[0 .. $ - 1]; }  in version()
3274  * are encoded with multiple code units. For example, the UTF-8 code units for
3283  *     If `r` is not an auto-decodable string (i.e. a narrow string or a
3284  *     user-defined type that implicitly converts to a string type), then `r`
3288  *      not already a string) and wrapped in a random-access range where the
3295  *      `r` is returned, and no implicit conversion takes place.
3324             @property auto ref back() inout { return str[$ - 1]; }  in back()
3325             void popBack()                  { str = str[0 .. $-1]; }  in popBack()
3344         return cast(StringTypeOf!R) r;
3348         // byCodeUnit for ranges and dchar[] is a no-op
3374 /// `byCodeUnit` does no Unicode decoding
3381     string noel2 = "no\u00EBl"; // noël using a precomposed ë character
3382     // Because string is UTF-8, the code unit at index 2 is just
3477         assert(bcu.front == cast(char) 244);
3492         assert(bcu.front == cast(wchar) 56319);
3506         assert(bcu.front == cast(dchar) 1114104);
3522         assert(bcu.front == cast(char) 244);
3538         assert(bcu.front == cast(char) 244);
3710     a[9] = cast(dchar) 0x110000; // invalid
3747     a[9] = cast(dchar) 0x110000; // invalid
3919  * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
3926  *      A forward range if `R` is a range and not auto-decodable, as defined by
3930  *      Or, if `R` is a range and it is auto-decodable and
3978                             buf[pos] = cast(C) c;  in front()
3989                             fill = cast(ushort) encode!(Yes.useReplacementDchar)(buf, dc);  in front()
4032     // hellö as a range of `char`s, which are UTF-8
4035     // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
4038     // 𐐷 is four code units in UTF-8, two in UTF-16, and one in UTF-32