1 /********************************************
2 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
3 *
4 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
5 * wchar type.
6 * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
7 * the D utf.dchar type.
8 *
9 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
10 *
11 * See_Also:
12 * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
13 * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
14 * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
15 *
16 * Copyright: Copyright Digital Mars 2003 - 2016.
17 * License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
18 * Authors: Walter Bright, Sean Kelly
19 * Source: $(DRUNTIMESRC src/rt/util/_utf.d)
20 */
21
22 module rt.util.utf;
23
/**
 * Error hook invoked by this module whenever malformed UTF input is found.
 * Only the C-linkage declaration is given here; the definition lives outside
 * this module.
 *
 * Params:
 *  msg  = description of the problem
 *  idx  = index of the offending code unit
 *  file = source file of the call site (defaults to the caller's __FILE__)
 *  line = source line of the call site (defaults to the caller's __LINE__)
 *
 * NOTE(review): every caller in this module follows the call with a "dummy
 * return", so it is presumably expected to throw rather than return - confirm
 * against the definition.
 */
extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;
25
/*******************************
 * Tests whether c is a valid UTF-32 code point.
 *
 * A value is accepted when it is at most 0x10FFFF and does not fall in the
 * surrogate range 0xD800 .. 0xDFFF.
 *
 * \uFFFE and \uFFFF are deliberately accepted: the Unicode standard permits
 * them for application-internal use even though they are not allowed for
 * interchange (thanks to Arcane Jill).
 *
 * Params:
 *  c = the value to test
 *
 * Returns: true if c is valid, false otherwise.
 */

@safe @nogc pure nothrow
bool isValidDchar(dchar c)
{
    // Surrogate code units are never valid as stand-alone characters.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    // Everything else up to the Unicode ceiling is accepted, including
    // 0xFFFE and 0xFFFF.
    return c <= 0x10FFFF;
}
48
// Smoke test: one plain ASCII character is valid, one value above the
// Unicode ceiling is not.
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    assert(isValidDchar('a'));
    assert(!isValidDchar(cast(dchar)0x1FFFFF));
}
55
56
57
/*
 * Lookup table indexed by the value of a UTF-8 byte. Each entry is the length
 * in bytes of the sequence that such a byte would start, or 0xFF when the byte
 * cannot start a sequence. The leading cast(ubyte) makes this an array of
 * ubyte.
 *
 * Note that lead bytes 0xF8 .. 0xFD still map to 5 and 6 here even though
 * decode() in this module rejects sequences longer than 4 bytes.
 */
static immutable UTF8stride =
[
    cast(ubyte)
    // 0x00 .. 0x7F: single-byte (ASCII) sequences
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: trailing bytes (10xxxxxx), never the start of a sequence
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF: two-byte lead bytes
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF: three-byte lead bytes
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xF7: four bytes; 0xF8 .. 0xFB: five; 0xFC .. 0xFD: six;
    // 0xFE and 0xFF: not a sequence start
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
78
/**
 * Returns the length of the UTF-8 sequence starting at index i in string s,
 * as given by the UTF8stride table entry for the byte s[i].
 *
 * Params:
 *  s = UTF-8 code units
 *  i = index of the byte to inspect
 *
 * Returns:
 *  The number of bytes in the UTF-8 sequence, or
 *  0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
@safe @nogc pure nothrow
uint stride(in char[] s, size_t i)
{
    immutable char lead = s[i];
    return UTF8stride[lead];
}
91
/**
 * Returns the length of the UTF-16 sequence starting at index i in string s.
 *
 * Params:
 *  s = UTF-16 code units
 *  i = index of the code unit to inspect
 *
 * Returns:
 *  2 when s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 */
@safe @nogc pure nothrow
uint stride(in wchar[] s, size_t i)
{
    immutable wchar unit = s[i];
    immutable bool isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
    return isHighSurrogate ? 2 : 1;
}
101
/**
 * Returns the length of the UTF-32 sequence starting at index i in string s.
 * Neither s nor i is inspected.
 *
 * Params:
 *  s = UTF-32 code units (unused)
 *  i = index into s (unused)
 *
 * Returns: The return value will always be 1.
 */
@safe @nogc pure nothrow
uint stride(in dchar[] s, size_t i)
{
    return 1;
}
112
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * Params:
 *  s = UTF-8 code units
 *  i = byte index, expected to lie on a sequence boundary
 *
 * Returns: the number of sequences that start before index i.
 *
 * onUnicodeError is called with "invalid UTF-8 sequence" when a byte that
 * cannot start a sequence is met while walking towards i, or when i turns
 * out to lie inside a sequence.
 */
@safe pure
size_t toUCSindex(in char[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        immutable uint len = stride(s, j);

        // stride() answers 0xFF for a byte that is not a sequence start.
        // That is a marker, not a length: adding it would jump 255 bytes
        // ahead and either miss the error or report it at the wrong index.
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", j);
        j += len;
        n++;
    }
    if (j > i)
    {
        // The walk stepped over i, so i was not on a sequence boundary.
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}
135
/**
 * Given an index i into the UTF-16 array s[], and assuming that i is at the
 * start of a UTF character, determines the number of UCS characters up to
 * that index.
 *
 * Params:
 *  s = UTF-16 code units
 *  i = code unit index, expected to lie on a sequence boundary
 *
 * Returns: the number of sequences that start before index i.
 *
 * onUnicodeError is called with "invalid UTF-16 sequence" when i turns out
 * to lie inside a surrogate pair.
 */
@safe pure
size_t toUCSindex(in wchar[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Overshooting i means i pointed into the middle of a sequence.
    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);

    return count;
}
154
/**
 * Given an index i into the UTF-32 array s[], returns the number of UCS
 * characters up to that index. Every element is one character, so the
 * answer is i itself; s is not inspected.
 */
@safe @nogc pure nothrow
size_t toUCSindex(in dchar[] s, size_t i)
{
    return i;
}
161
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 *
 * Params:
 *  s = UTF-8 code units
 *  n = number of characters to skip from the start of s
 *
 * Returns: the byte index reached after stepping over n sequences.
 *
 * onUnicodeError is called with "invalid UTF-8 sequence" when a byte that
 * cannot start a sequence (table entry 0xFF) is met on the way.
 */
@safe pure
size_t toUTFindex(in char[] s, size_t n)
{
    size_t pos = 0;

    for (size_t remaining = n; remaining != 0; --remaining)
    {
        immutable uint step = UTF8stride[s[pos]];

        if (step == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += step;
    }
    return pos;
}
179
/**
 * Given a UCS index n into the UTF-16 array s[], returns the UTF index.
 *
 * Params:
 *  s = UTF-16 code units
 *  n = number of characters to skip from the start of s
 *
 * Returns: the code unit index reached after stepping over n characters,
 * where a high surrogate (0xD800 .. 0xDBFF) counts as the start of a
 * two-unit character. No validation is performed.
 */
@safe @nogc pure nothrow
size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t pos = 0;

    for (size_t remaining = n; remaining != 0; --remaining)
    {
        immutable wchar unit = s[pos];

        pos += (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
    }
    return pos;
}
193
/**
 * Given a UCS index n into the UTF-32 array s[], returns the UTF index.
 * Every element is one character, so the answer is n itself; s is not
 * inspected.
 */
@safe @nogc pure nothrow
size_t toUTFindex(in dchar[] s, size_t n)
{
    return n;
}
200
201 /* =================== Decode ======================= */
202
/***************
 * Decodes and returns the character starting at s[idx]. On success idx is
 * advanced past the decoded character. If the sequence is not well formed,
 * onUnicodeError is called with "invalid UTF-8 sequence" and the index of
 * the lead byte, and idx is left unchanged.
 *
 * Rejected input: a trailing byte in lead position, sequences longer than
 * 4 bytes, sequences running past the end of s, overlong encodings, trailing
 * bytes that are not of the form 10xxxxxx, and results for which
 * isValidDchar is false.
 *
 * Params:
 *  s   = UTF-8 code units
 *  idx = index of the first byte of the sequence; the in contract requires
 *        idx < s.length
 *
 * Returns: the decoded character.
 */
@safe pure
dchar decode(in char[] s, ref size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t len = s.length;
    dchar V;
    size_t i = idx;     // working index; idx itself is written only on success
    char u = s[i];

    if (u & 0x80)
    {   uint n;
        char u2;

        /* The following encodings are valid, except for the 5 and 6 byte
         * combinations:
         *  0xxxxxxx
         *  110xxxxx 10xxxxxx
         *  1110xxxx 10xxxxxx 10xxxxxx
         *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         */
        // Count the leading 1 bits of the lead byte; n ends up as the total
        // sequence length in bytes.
        for (n = 1; ; n++)
        {
            if (n > 4)
                goto Lerr;              // only do the first 4 of 6 encodings
            if (((u << n) & 0x80) == 0)
            {
                if (n == 1)
                    goto Lerr;          // 10xxxxxx: a trailing byte cannot lead
                break;
            }
        }

        // Pick off (7 - n) significant bits of B from first byte of octet
        V = cast(dchar)(u & ((1 << (7 - n)) - 1));

        if (i + (n - 1) >= len)
            goto Lerr;                  // off end of string

        /* The following combinations are overlong, and illegal:
         *  1100000x (10xxxxxx)
         *  11100000 100xxxxx (10xxxxxx)
         *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
         *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
         *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
         */
        // n is between 2 and 4 here, so u lies in 0xC0 .. 0xF7 and the 0xF8
        // and 0xFC comparisons below can never match; they only mirror the
        // table above.
        u2 = s[i + 1];
        if ((u & 0xFE) == 0xC0 ||
            (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
            (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
            (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
            (u == 0xFC && (u2 & 0xFC) == 0x80))
            goto Lerr;                  // overlong combination

        // Fold in 6 payload bits from each trailing byte.
        for (uint j = 1; j != n; j++)
        {
            u = s[i + j];
            if ((u & 0xC0) != 0x80)
                goto Lerr;              // trailing bytes are 10xxxxxx
            V = (V << 6) | (u & 0x3F);
        }
        if (!isValidDchar(V))
            goto Lerr;                  // surrogate or above 0x10FFFF
        i += n;
    }
    else
    {
        // Plain ASCII: the byte is the character.
        V = cast(dchar) u;
        i++;
    }

    idx = i;
    return V;

  Lerr:
    // i has not been advanced on any path leading here, so the reported
    // index is that of the lead byte.
    onUnicodeError("invalid UTF-8 sequence", i);
    return V; // dummy return
}
295
// Exercises decode() on UTF-8 input: well-formed 1, 2 and 3 byte sequences,
// then a set of malformed sequences that must all be rejected.
unittest
{
    size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // ASCII: one byte per character, index advances by one each time.
    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    // Two-byte sequence.
    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    // Three-byte sequence.
    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Malformed input: a truncated sequence, overlong encodings, and the
    // 5 and 6 byte forms.
    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (bad; s4)
    {
        // Record the rejection in a flag and assert outside the try block.
        // An assert placed inside a try guarded by catch (Throwable) is
        // swallowed by that very handler, so the check could never fail.
        bool rejected = false;

        i = 0;
        try
        {
            c = decode(bad, i);
        }
        catch (Exception e)
        {
            rejected = true;
        }
        assert(rejected);
        assert(i == 0);     // the index must not move on failure
    }
}
347
/**
 * Decodes and returns the character starting at s[idx] in a UTF-16 array.
 * On success idx is advanced past the decoded character (by 1, or by 2 for
 * a surrogate pair). On malformed input onUnicodeError is called with a
 * message describing the problem and the index of the first code unit, and
 * idx is left unchanged.
 *
 * Rejected input: a high surrogate at the very end of s, a high surrogate
 * not followed by a low surrogate, a lone low surrogate, and the values
 * 0xFFFE and 0xFFFF.
 *
 * Params:
 *  s   = UTF-16 code units
 *  idx = index of the first code unit of the character; the in contract
 *        requires idx < s.length
 *
 * Returns: the decoded character.
 */
@safe pure
dchar decode(in wchar[] s, ref size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    string reason;
    size_t pos = idx;       // idx itself is written only on success
    uint unit = s[pos];

    if (unit <= 0x7F)
    {
        // ASCII fast path.
        ++pos;
    }
    else if (unit >= 0xD800 && unit <= 0xDBFF)
    {
        // High surrogate: a low surrogate must follow.
        uint low;

        if (pos + 1 == s.length)
        {
            reason = "surrogate UTF-16 high value past end of string";
            goto Lfail;
        }
        low = s[pos + 1];
        if (!(low >= 0xDC00 && low <= 0xDFFF))
        {
            reason = "surrogate UTF-16 low value out of range";
            goto Lfail;
        }
        // 0xD7C0 is 0xD800 - (0x10000 >> 10): it removes the surrogate
        // base and adds the 0x10000 offset in one step.
        unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
        pos += 2;
    }
    else if (unit >= 0xDC00 && unit <= 0xDFFF)
    {
        reason = "unpaired surrogate UTF-16 value";
        goto Lfail;
    }
    else if (unit == 0xFFFE || unit == 0xFFFF)
    {
        reason = "illegal UTF-16 value";
        goto Lfail;
    }
    else
    {
        // Any other single code unit stands for itself.
        ++pos;
    }

    idx = pos;
    return cast(dchar) unit;

  Lfail:
    onUnicodeError(reason, pos);
    return cast(dchar) unit; // dummy return
}
405
/**
 * Returns the character at s[idx] in a UTF-32 array. On success idx is
 * advanced by one. If the value fails isValidDchar, onUnicodeError is called
 * with "invalid UTF-32 value" and the index, and idx is left unchanged.
 *
 * Params:
 *  s   = UTF-32 code units
 *  idx = index of the character; the in contract requires idx < s.length
 *
 * Returns: the character at the given index.
 */
@safe pure
dchar decode(in dchar[] s, ref size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
body
{
    immutable size_t pos = idx;
    immutable dchar value = s[pos];

    if (isValidDchar(value))
    {
        idx = pos + 1;
        return value;
    }

    onUnicodeError("invalid UTF-32 value", pos);
    return value; // dummy return
}
427
428
429 /* =================== Encode ======================= */
430
/*******************************
 * Encodes character c as UTF-8 and appends it to array s[].
 *
 * Params:
 *  s = array to append to; updated in place
 *  c = character to encode; the in contract requires isValidDchar(c)
 */
@safe pure nothrow
void encode(ref char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // ASCII needs no staging buffer.
    if (c <= 0x7F)
    {
        s ~= cast(char) c;
        return;
    }

    char[4] units;
    size_t count;

    if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        units[0] = cast(char)(0xC0 | (c >> 6));
        units[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        units[0] = cast(char)(0xE0 | (c >> 12));
        units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        units[0] = cast(char)(0xF0 | (c >> 18));
        units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }
    else
    {
        // Excluded by the in contract.
        assert(0);
    }

    s ~= units[0 .. count];
}
482
// Appends a 1, 2 and 3 byte character in turn and checks the resulting bytes.
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd".dup;

    // One-byte (ASCII) character.
    encode(text, 'a');
    assert(text == "abcda");
    assert(text.length == 5);

    // Two-byte character.
    encode(text, '\u00A9');
    assert(text == "abcda\xC2\xA9");
    assert(text.length == 7);
    //assert(text == "abcda\u00A9");   // BUG: fix compiler

    // Three-byte character.
    encode(text, '\u2260');
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
    assert(text.length == 10);
}
501
/**
 * Encodes character c as UTF-16 and appends it to array s[]. Characters up
 * to 0xFFFF are appended as a single code unit, larger ones as a surrogate
 * pair.
 *
 * Params:
 *  s = array to append to; updated in place
 *  c = character to encode; the in contract requires isValidDchar(c)
 */
@safe pure nothrow
void encode(ref wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c <= 0xFFFF)
    {
        s ~= cast(wchar) c;
        return;
    }

    // Split the 20-bit offset above 0x10000 into two 10-bit halves.
    immutable uint offset = c - 0x10000;
    wchar[2] pair;

    pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
    pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
    s ~= pair;
}
527
/**
 * Appends character c to the UTF-32 array s[]; no transformation is needed.
 *
 * Params:
 *  s = array to append to; updated in place
 *  c = character to append; the in contract requires isValidDchar(c)
 */
@safe pure nothrow
void encode(ref dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    s ~= c;
}
539
/**
Returns the code length of $(D c) in the encoding using $(D C) as a
code unit. The length is returned as a count of code units, not bytes.

The encoding is chosen by $(D C.sizeof): 1 selects UTF-8, 2 selects UTF-16
and 4 selects UTF-32; any other size fails to compile. For UTF-8 a value
above 0x10FFFF trips $(D assert(false)).
 */
@safe pure nothrow @nogc
ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        if (c > 0x10FFFF)
            assert(false);
        if (c > 0xFFFF)
            return 4;
        if (c > 0x7FF)
            return 3;
        if (c > 0x7F)
            return 2;
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // Above the BMP a surrogate pair is needed.
        if (c > 0xFFFF)
            return 2;
        return 1;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}
565
566 /* =================== Validation ======================= */
567
/***********************************
Checks whether a string is well formed. $(D S) can be an array of
$(D char), $(D wchar), or $(D dchar). The whole string is run through
$(D decode), one character at a time, so malformed input is reported the
same way $(D decode) reports it. Use to check all untrusted input for
correctness.
 */
@safe pure
void validate(S)(in S s)
{
    immutable size_t total = s.length;
    size_t pos = 0;

    // decode() moves pos forward past each character it accepts.
    while (pos < total)
        decode(s, pos);
}
582
583 /* =================== Conversion to UTF8 ======================= */
584
/**
 * Encodes character c as UTF-8 into the caller-supplied buffer buf, without
 * allocating.
 *
 * Params:
 *  buf = destination buffer; must have room for the encoded form (up to 4
 *        elements), otherwise indexing goes out of bounds
 *  c   = character to encode; the in contract requires isValidDchar(c)
 *
 * Returns: the slice of buf holding the encoded character.
 */
@safe pure nothrow @nogc
char[] toUTF8(char[] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t used;

    if (c <= 0x7F)
    {
        buf[0] = cast(char) c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        // Excluded by the in contract.
        assert(0);
    }

    return buf[0 .. used];
}
621
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * The input is already UTF-8, so it is returned as is, without copying.
 * The in contract runs validate(s) on it.
 */
@safe pure nothrow
string toUTF8(string s)
in
{
    validate(s);
}
body
{
    return s;
}
635
/**
 * Encodes the UTF-16 string s into UTF-8 and returns the encoded string.
 *
 * A leading run of ASCII code units is copied across directly. From the
 * first non-ASCII code unit on, the rest of s is iterated as dchar (so
 * surrogate pairs are combined by the foreach) and each character is
 * appended with encode().
 *
 * Params:
 *  s = UTF-16 code units
 *
 * Returns: a newly built UTF-8 string.
 */
@trusted pure
string toUTF8(in wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    // Sized for the all-ASCII case, where output and input lengths match.
    r.length = slen;

    for (i = 0; i < slen; i++)
    {
        wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            // Keep what was copied so far and encode the remainder. The
            // loop variable has its own name rather than reusing c, which
            // would shadow the code unit declared above.
            r.length = i;
            foreach (dchar ch; s[i .. slen])
            {
                encode(r, ch);
            }
            break;
        }
    }
    return cast(string)r;
}
663
/**
 * Encodes the UTF-32 string s into UTF-8 and returns the encoded string.
 *
 * A leading run of ASCII characters is copied across directly; from the
 * first non-ASCII character on, every remaining character is appended with
 * encode().
 *
 * Params:
 *  s = UTF-32 code units
 *
 * Returns: a newly built UTF-8 string.
 */
@trusted pure
string toUTF8(in dchar[] s)
{
    immutable size_t total = s.length;
    char[] result;

    // Sized for the all-ASCII case, where output and input lengths match.
    result.length = total;

    // Copy the ASCII prefix.
    size_t pos = 0;
    while (pos < total && s[pos] <= 0x7F)
    {
        result[pos] = cast(char) s[pos];
        ++pos;
    }

    // Anything left starts with a non-ASCII character: keep the prefix and
    // encode the remainder.
    if (pos < total)
    {
        result.length = pos;
        foreach (dchar d; s[pos .. total])
        {
            encode(result, d);
        }
    }
    return cast(string) result;
}
691
692 /* =================== Conversion to UTF16 ======================= */
693
/**
 * Encodes character c as UTF-16 into the caller-supplied buffer buf, without
 * allocating.
 *
 * Params:
 *  buf = destination buffer; must have room for the encoded form (up to 2
 *        elements), otherwise indexing goes out of bounds
 *  c   = character to encode; the in contract requires isValidDchar(c)
 *
 * Returns: the slice of buf holding the encoded character.
 */
@safe pure nothrow @nogc
wchar[] toUTF16(wchar[] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c > 0xFFFF)
    {
        // Surrogate pair: split the 20-bit offset above 0x10000 in two.
        immutable uint offset = c - 0x10000;

        buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }

    buf[0] = cast(wchar) c;
    return buf[0 .. 1];
}
714
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 *
 * ASCII bytes are widened directly; everything else goes through decode()
 * and encode(), so malformed UTF-8 is reported the way decode() reports it.
 *
 * Params:
 *  s = UTF-8 code units
 *
 * Returns: a newly built UTF-16 string.
 */
@trusted pure
wstring toUTF16(in char[] s)
{
    immutable size_t total = s.length;
    wchar[] result;

    result.length = total;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        immutable char lead = s[pos];

        if (lead > 0x7F)
        {
            // decode() advances pos past the whole sequence.
            encode(result, decode(s, pos));
            continue;
        }
        result ~= cast(wchar) lead;
        ++pos;
    }
    return cast(wstring) result;
}
744
alias const(wchar)* wptr;
/**
 * Encodes string s into UTF-16, appends a terminating zero code unit and
 * returns a pointer to the first code unit.
 *
 * ASCII bytes are widened directly; everything else goes through decode()
 * and encode(), so malformed UTF-8 is reported the way decode() reports it.
 *
 * Params:
 *  s = UTF-8 code units
 *
 * Returns: pointer to a newly built, zero-terminated UTF-16 string.
 */
@safe pure
wptr toUTF16z(in char[] s)
{
    immutable size_t total = s.length;
    wchar[] result;

    result.length = total + 1;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        immutable char lead = s[pos];

        if (lead > 0x7F)
        {
            // decode() advances pos past the whole sequence.
            encode(result, decode(s, pos));
            continue;
        }
        result ~= cast(wchar) lead;
        ++pos;
    }

    // The terminator guarantees at least one element, so result[0] exists.
    result ~= '\000';
    return &result[0];
}
772
/**
 * Encodes string s into UTF-16 and returns the encoded string.
 *
 * The input is already UTF-16, so it is returned as is, without copying.
 * The in contract runs validate(s) on it.
 */
@safe pure nothrow
wstring toUTF16(wstring s)
in
{
    validate(s);
}
body
{
    return s;
}
784
/**
 * Encodes the UTF-32 string s into UTF-16 and returns the encoded string.
 * Every character is appended with encode().
 *
 * Params:
 *  s = UTF-32 code units
 *
 * Returns: a newly built UTF-16 string.
 */
@trusted pure nothrow
wstring toUTF16(in dchar[] s)
{
    wchar[] result;

    result.length = s.length;
    result.length = 0;

    foreach (dchar c; s)
    {
        encode(result, c);
    }
    return cast(wstring) result;
}
800
801 /* =================== Conversion to UTF32 ======================= */
802
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * ASCII bytes are widened directly; everything else goes through decode(),
 * so malformed UTF-8 is reported the way decode() reports it.
 *
 * Params:
 *  s = UTF-8 code units
 *
 * Returns: a newly built UTF-32 string.
 */
@trusted pure
dstring toUTF32(in char[] s)
{
    immutable size_t total = s.length;
    dchar[] result;
    size_t written = 0;

    // One output element per input byte is an upper bound: the result will
    // never be longer than s[].
    result.length = total;

    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c < 0x80)
            ++pos;                  // c is ascii, no need for decode
        else
            c = decode(s, pos);     // advances pos past the sequence
        result[written] = c;
        ++written;
    }
    return cast(dstring) result[0 .. written];
}
825
/**
 * Encodes the UTF-16 string s into UTF-32 and returns the encoded string.
 *
 * ASCII code units are widened directly; everything else goes through
 * decode(), so malformed UTF-16 is reported the way decode() reports it.
 *
 * Params:
 *  s = UTF-16 code units
 *
 * Returns: a newly built UTF-32 string.
 */
@trusted pure
dstring toUTF32(in wchar[] s)
{
    immutable size_t total = s.length;
    dchar[] result;
    size_t written = 0;

    // One output element per input code unit is an upper bound: the result
    // will never be longer than s[].
    result.length = total;

    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c < 0x80)
            ++pos;                  // c is ascii, no need for decode
        else
            c = decode(s, pos);     // advances pos past the character
        result[written] = c;
        ++written;
    }
    return cast(dstring) result[0 .. written];
}
846
/**
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * The input is already UTF-32, so it is returned as is, without copying.
 * The in contract runs validate(s) on it.
 */
@safe pure nothrow
dstring toUTF32(dstring s)
in
{
    validate(s);
}
body
{
    return s;
}
858
859 /* ================================ tests ================================== */
860
// Round-trip tests: each sample text is converted between UTF-8, UTF-16 and
// UTF-32 in every direction and compared against the same literal.
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Pure ASCII: exercises the fast paths.
    auto c = "hello"c[];
    auto w = toUTF16(c);
    assert(w == "hello");
    auto d = toUTF32(c);
    assert(d == "hello");

    c = toUTF8(w);
    assert(c == "hello");
    d = toUTF32(w);
    assert(d == "hello");

    c = toUTF8(d);
    assert(c == "hello");
    w = toUTF16(d);
    assert(w == "hello");


    // A character that takes 3 bytes in UTF-8 and 1 code unit in UTF-16.
    c = "hel\u1234o";
    w = toUTF16(c);
    assert(w == "hel\u1234o");
    d = toUTF32(c);
    assert(d == "hel\u1234o");

    c = toUTF8(w);
    assert(c == "hel\u1234o");
    d = toUTF32(w);
    assert(d == "hel\u1234o");

    c = toUTF8(d);
    assert(c == "hel\u1234o");
    w = toUTF16(d);
    assert(w == "hel\u1234o");


    // A character above 0xFFFF: 4 bytes in UTF-8, a surrogate pair in UTF-16.
    c = "he\U000BAAAAllo";
    w = toUTF16(c);
    //foreach (wchar c; w) printf("c = x%x\n", c);
    //foreach (wchar c; cast(wstring)"he\U000BAAAAllo") printf("c = x%x\n", c);
    assert(w == "he\U000BAAAAllo");
    d = toUTF32(c);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(w);
    assert(c == "he\U000BAAAAllo");
    d = toUTF32(w);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(d);
    assert(c == "he\U000BAAAAllo");
    w = toUTF16(d);
    assert(w == "he\U000BAAAAllo");

    // Buffer-based overload: the same character fills a two-element buffer.
    wchar[2] buf;
    auto ret = toUTF16(buf, '\U000BAAAA');
    assert(ret == "\U000BAAAA");
}
921