Lines Matching +full:no +full:- +full:cast
4 Encode and decode UTF-8, UTF-16 and UTF-32 strings.
53 $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
55 Copyright: Copyright Digital Mars 2000 - 2012.
66 import std.typecons; // Flag, Yes, No
112 * it is const-compatible. in toString()
115 auto e = () @trusted { return cast(Exception) super; } (); in toString()
134 result ~= " - "; in toString()
172 cast(char)(0xF0 | (y >> 18)),
173 cast(char)(0x80 | ((y >> 12) & 0x3F)),
174 cast(char)(0x80 | ((y >> 6) & 0x3F)),
175 cast(char)(0x80 | (y & 0x3F))
178 cast(char)(0xF8 | 3), // 5 byte encoding
179 cast(char)(0x80 | 3),
180 cast(char)(0x80 | 3),
181 cast(char)(0x80 | 3),
182 cast(char)(0x80 | 3),
185 cast(char)(0xFC | 3), // 6 byte encoding
186 cast(char)(0x80 | 3),
187 cast(char)(0x80 | 3),
188 cast(char)(0x80 | 3),
189 cast(char)(0x80 | 3),
190 cast(char)(0x80 | 3),
201 cast(wchar) 0xDC00,
204 cast(wchar) 0xDFFF,
207 cast(wchar) 0xDBFF,
208 cast(wchar) 0xDBFF,
211 cast(wchar) 0xDBFF,
212 cast(wchar) 0xE000,
215 cast(wchar) 0xD800,
225 [ cast(dchar) 0x110000 ],
226 [ cast(dchar) 0x00D800 ],
227 [ cast(dchar) 0x00DFFF ],
261 assert( isValidDchar(cast(dchar)'a') == true);
262 assert( isValidDchar(cast(dchar) 0x1FFFFF) == false);
264 assert(!isValidDchar(cast(dchar) 0x00D800));
265 assert(!isValidDchar(cast(dchar) 0x00DBFF));
266 assert(!isValidDchar(cast(dchar) 0x00DC00));
267 assert(!isValidDchar(cast(dchar) 0x00DFFF));
268 assert( isValidDchar(cast(dchar) 0x00FFFE));
269 assert( isValidDchar(cast(dchar) 0x00FFFF));
270 assert( isValidDchar(cast(dchar) 0x01FFFF));
271 assert( isValidDchar(cast(dchar) 0x10FFFF));
272 assert(!isValidDchar(cast(dchar) 0x110000));
287 The number of code units in the UTF sequence. For UTF-8, this is a
288 …value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) secti…
289 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
306 assert(index < str.length, "Past the end of the UTF-8 sequence");
336 immutable msbs = 7 - bsr((~uint(c)) & 0xFF);
338 throw new UTFException("Invalid UTF-8 sequence", index);
429 assert(index < str.length, "Past the end of the UTF-16 sequence");
445 assert(!str.empty, "UTF-16 sequence is empty");
523 assert(index < str.length, "Past the end of the UTF-32 sequence");
525 assert(!str.empty, "UTF-32 sequence is empty.");
606 The number of code units in the UTF sequence. For UTF-8, this is a
607 …value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) secti…
608 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1.
615 $(D strideBack) will only analyze the element at $(D str[index - 1])
625 assert(index <= str.length, "Past the end of the UTF-8 sequence");
626 assert(index > 0, "Not the end of the UTF-8 sequence");
628 if ((str[index-1] & 0b1100_0000) != 0b1000_0000)
635 if ((str[index-i] & 0b1100_0000) != 0b1000_0000)
643 if (index >= i && (str[index-i] & 0b1100_0000) != 0b1000_0000)
662 assert(!str.empty, "Past the end of the UTF-8 sequence");
672 throw new UTFException("The last code unit is not the end of the UTF-8 sequence");
742 //UTF-16 is self synchronizing: The length of strideBack can be found from
750 assert(index <= str.length, "Past the end of the UTF-16 sequence");
751 assert(index > 0, "Not the end of a UTF-16 sequence");
753 immutable c2 = str[index-1];
762 assert(!str.empty, "UTF-16 sequence is empty");
765 immutable c2 = str[$ - 1];
844 assert(index <= str.length, "Past the end of the UTF-32 sequence");
845 assert(index > 0, "Not the end of the UTF-32 sequence");
853 assert(!str.empty, "Empty UTF-32 sequence");
948 throw new UTFException("Invalid UTF-8 sequence", index);
950 throw new UTFException("Invalid UTF-16 sequence", index);
989 while (n--)
1022 well-formed, then a $(D UTFException) is thrown and $(D index) remains
1039 sequence and useReplacementDchar is $(D No.useReplacementDchar)
1041 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, r…
1060 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1097 sequence. If an exception is thrown, then there is no guarantee as to
1102 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1139 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1168 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
1194 but there is no guarantee as to the value of $(D numCodeUnits) (when passed).
1196 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1209 if (str[$ - 1] < codeUnitLimit!S)
1212 immutable retval = str[$ - 1];
1213 str = str[0 .. $ - 1];
1219 immutable newLength = str.length - numCodeUnits;
1228 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(
1254 size_t index = str.length - numCodeUnits;
1266 codeUnits[--i] = tmp.back;
1279 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str)
1326 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacement…
1341 /* Dchar bitmask for different numbers of UTF-8 code units.
1343 alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);
1357 immutable length = str.length - index;
1387 return exception(pstr[0 .. length], "Invalid UTF-8 sequence"); in invalidUTF()
1394 //error message for the (hopefully) rare case when an invalid UTF-8 in invalidUTF()
1397 return new UTFException("Invalid UTF-8 sequence"); in invalidUTF()
1476 if (!(fst & 0x80)) // no more bytes
1481 if ((d & ~bitMask[i - 1]) == 0)
1557 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacement…
1573 immutable length = str.length - index;
1611 throw exception("surrogate UTF-16 high value past end of string");
1627 throw exception("surrogate UTF-16 low value out of range");
1630 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
1638 throw exception("unpaired surrogate UTF-16 value");
1645 return cast(dchar) u;
1674 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacement…
1691 throw new UTFException("Invalid UTF-32 value").setSequence(dc);
1704 throw new UTFException("Invalid UTF-32 value").setSequence(dc);
1788 enforce(range.length == lenBefore - numCodeUnits, in version()
1818 enforce(range.length == lenBefore - numCodeUnits, in version()
1953 //Invalid UTF-8 sequence where the first code unit is valid.
1954 testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3);
1955 testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3);
1957 //Invalid UTF-8 sequence where the first code unit isn't valid.
1983 testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1);
1984 testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2);
1985 testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2);
1986 testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
1987 testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);
1989 testBadDecode(S([ cast(wchar) 0xD801 ]), 0);
1990 testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0);
1992 testBadDecodeBack(S([ cast(wchar) 0xD801 ]));
1993 testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ]));
2016 auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00,
2017 cast(wchar) 0x1400,
2018 cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]);
2019 testDecode(str, 0, cast(dchar) 0x10000, 2);
2020 testDecode(str, 2, cast(dchar) 0x1400, 3);
2021 testDecode(str, 3, cast(dchar) 0xB9DDE, 5);
2022 testDecodeBack(str, cast(dchar) 0xB9DDE, 2);
2023 testDecodeBack(str, cast(dchar) 0x1400, 1);
2024 testDecodeBack(str, cast(dchar) 0x10000, 2);
2039 testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1);
2040 testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1);
2041 testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1);
2042 testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1);
2043 testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1);
2045 testBadDecode(S([cast(dchar) 0xD800]), 0);
2046 testBadDecode(S([cast(dchar) 0xDFFE]), 0);
2047 testBadDecode(S([cast(dchar) 0x110000]), 0);
2049 testBadDecodeBack(S([cast(dchar) 0xD800]));
2050 testBadDecodeBack(S([cast(dchar) 0xDFFE]));
2051 testBadDecodeBack(S([cast(dchar) 0x110000]));
2074 auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]);
2078 testDecodeBack(str, cast(dchar) 0xB9DDE, 1);
2079 testDecodeBack(str, cast(dchar) 0x1400, 1);
2080 testDecodeBack(str, cast(dchar) 0x10000, 1);
2140 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2146 buf[0] = cast(char) c;
2152 buf[0] = cast(char)(0xC0 | (c >> 6));
2153 buf[1] = cast(char)(0x80 | (c & 0x3F));
2159 c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);
2163 buf[0] = cast(char)(0xE0 | (c >> 12));
2164 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
2165 buf[2] = cast(char)(0x80 | (c & 0x3F));
2171 buf[0] = cast(char)(0xF0 | (c >> 18));
2172 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
2173 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
2174 buf[3] = cast(char)(0x80 | (c & 0x3F));
2179 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
2202 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2203 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2204 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2205 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2206 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2208 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2215 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2221 … c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);
2225 buf[0] = cast(wchar) c;
2231 buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
2232 buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
2236 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
2255 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2256 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2257 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2258 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2259 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2261 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2268 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2272 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
2293 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2294 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2295 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2296 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2297 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2299 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride);
2311 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2319 r ~= cast(char) c;
2329 buf[0] = cast(char)(0xC0 | (c >> 6));
2330 buf[1] = cast(char)(0x80 | (c & 0x3F));
2336 … c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c);
2340 buf[0] = cast(char)(0xE0 | (c >> 12));
2341 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
2342 buf[2] = cast(char)(0x80 | (c & 0x3F));
2348 buf[0] = cast(char)(0xF0 | (c >> 18));
2349 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
2350 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
2351 buf[3] = cast(char)(0x80 | (c & 0x3F));
2357 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c);
2372 encode(s, cast(dchar)'a');
2376 encode(s, cast(dchar)'\u00A9');
2381 encode(s, cast(dchar)'\u2260');
2406 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2407 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2408 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2409 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2410 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2413 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2419 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2427 … c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c);
2431 r ~= cast(wchar) c;
2438 buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
2439 buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
2445 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c);
2467 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2468 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2469 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2470 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2471 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2474 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2480 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)(
2484 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c);
2504 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800));
2505 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF));
2506 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00));
2507 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF));
2508 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000));
2511 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000);
2661 assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF
2667 assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00));
2668 assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00));
2677 Checks to see if $(D str) is well-formed Unicode or not.
2680 $(D UTFException) if $(D str) is not well-formed.
2703 * Encodes the elements of `s` to UTF-8 and returns a newly allocated
2709 * A UTF-8 string
2711 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
2724 // The ö is represented by two UTF-8 code units
2727 // is four code units in UTF-8
2744 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated
2750 * A UTF-16 string
2752 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
2765 // these graphemes are two code units in UTF-16 and one in UTF-32
2787 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated
2793 * A UTF-32 string
2795 * For a lazy, non-allocating version of these functions, see $(LREF byUTF).
2827 Returns a C-style zero-terminated string equivalent to $(D str). $(D str) in toUTFImpl()
2835 to allocate a new string - particularly when dealing with character types in toUTFImpl()
2841 zero-terminated anymore. The most likely scenarios for that are if you in toUTFImpl()
2842 append to $(D str) and no reallocation takes place or when $(D str) is a in toUTFImpl()
2847 user-defined type with one declared right after the other) and that in toUTFImpl()
2849 occur if you immediately use the zero-terminated string after calling in toUTFImpl()
2851 Also, they are unlikely to occur even if you save the zero-terminated string in toUTFImpl()
2853 However, if you save the zero-terminated string and want to be absolutely in toUTFImpl()
2854 certain that the string stays zero-terminated, then simply append a in toUTFImpl()
2886 //immutable(C)[] -> C*, const(C)*, or immutable(C)*
2901 return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
2910 // Peek past end of str, if it's 0, no conversion necessary.
2919 if ((cast(size_t) p & 3) && *p == '\0')
2923 return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
2931 //C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
2936 //const(C)[] -> const(C)* or
2937 //C[] -> C* or const(C)*
2946 if ((cast(size_t) p & 3) && *p == '\0')
2953 //const(C)[] -> C* or immutable(C)* or
2954 //C[] -> immutable(C)*
2959 copy[0 .. $ - 1] = str[];
2960 copy[$ - 1] = '\0';
2962 auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; } in trustedCast()
2970 //C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
2979 return () @trusted { return cast(P) retval.data.ptr; } ();
2998 temp[0 .. $ - 1] = s1[0 .. $];
2999 temp[$ - 1] = '\n';
3000 --temp.length;
3073 Encodes string $(D s) into UTF-16 and returns the encoded string.
3130 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
3133 $(D UTFException) if $(D str) is not well-formed.
3178 @property C back() { return _str[$ - 1]; } in version()
3179 void popBack() { _str = _str[0 .. $ - 1]; } in version()
3197 @property C back() { return _str[$ - 1]; } in version()
3198 void popBack() { _str = _str[0 .. $ - 1]; } in version()
3218 @property C back() { return _str[$ - 1]; } in version()
3219 void popBack() { _str = _str[0 .. $ - 1]; } in version()
3237 @property C back() { return _str[$ - 1]; } in version()
3238 void popBack() { _str = _str[0 .. $ - 1]; } in version()
3274 * are encoded with multiple code units. For example, the UTF-8 code units for
3283 * If `r` is not an auto-decodable string (i.e. a narrow string or a
3284 * user-defined type that implicitly converts to a string type), then `r`
3288 * not already a string) and wrapped in a random-access range where the
3295 * `r` is returned, and no implicit conversion takes place.
3324 @property auto ref back() inout { return str[$ - 1]; } in back()
3325 void popBack() { str = str[0 .. $-1]; } in popBack()
3344 return cast(StringTypeOf!R) r;
3348 // byCodeUnit for ranges and dchar[] is a no-op
3374 /// `byCodeUnit` does no Unicode decoding
3381 string noel2 = "no\u00EBl"; // noël using a precomposed ë character
3382 // Because string is UTF-8, the code unit at index 2 is just
3477 assert(bcu.front == cast(char) 244);
3492 assert(bcu.front == cast(wchar) 56319);
3506 assert(bcu.front == cast(dchar) 1114104);
3522 assert(bcu.front == cast(char) 244);
3538 assert(bcu.front == cast(char) 244);
3710 a[9] = cast(dchar) 0x110000; // invalid
3747 a[9] = cast(dchar) 0x110000; // invalid
3919 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the
3926 * A forward range if `R` is a range and not auto-decodable, as defined by
3930 * Or, if `R` is a range and it is auto-decodable and
3978 buf[pos] = cast(C) c; in front()
3989 fill = cast(ushort) encode!(Yes.useReplacementDchar)(buf, dc); in front()
4032 // hellö as a range of `char`s, which are UTF-8
4035 // `wchar`s are able to hold the ö in a single element (UTF-16 code unit)
4038 // is four code units in UTF-8, two in UTF-16, and one in UTF-32