xref: /netbsd-src/external/gpl3/gcc/dist/libphobos/libdruntime/core/internal/utf.d (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1 /********************************************
2  * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
3  *
4  * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
5  * wchar type.
6  * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
7  * the D utf.dchar type.
8  *
9  * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
10  *
11  * See_Also:
12  *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
13  *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
14  *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
15  *
16  * Copyright: Copyright Digital Mars 2003 - 2016.
17  * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
18  * Authors:   Walter Bright, Sean Kelly
19  * Source:    $(DRUNTIMESRC core/internal/_utf.d)
20  */
21 
22 module core.internal.utf;
23 
24 extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;
25 
/*******************************
 * Test if c is a valid UTF-32 character.
 *
 * A value is accepted when it lies in the Unicode code space
 * (at most \U0010FFFF) and is not a surrogate (0xD800 .. 0xDFFF).
 *
 * \uFFFE and \uFFFF are considered valid by this function,
 * as they are permitted for internal use by an application,
 * but they are not allowed for interchange by the Unicode standard.
 * (thanks to Arcane Jill)
 *
 * Returns: true if it is, false if not.
 */
@safe @nogc pure nothrow
bool isValidDchar(dchar c)
{
    // Anything past the last code point can never be valid.
    if (c > 0x10FFFF)
        return false;

    // Inside the code space only the surrogate block is excluded;
    // 0xFFFE and 0xFFFF are deliberately NOT filtered out here.
    immutable bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;
    return !isSurrogate;
}
48 
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    // An ordinary ASCII letter is a valid code point ...
    assert(isValidDchar(cast(dchar)'a'));
    // ... while a value beyond 0x10FFFF is not.
    assert(!isValidDchar(cast(dchar)0x1FFFFF));
}
55 
56 
57 
// Lookup table indexed by a UTF-8 code unit. Each entry is the sequence
// length associated with that unit when it is used as a lead byte, or 0xFF
// when the unit is not treated as the start of a sequence.
static immutable UTF8stride =
[
    cast(ubyte)
    // 0x00 .. 0x7F: one-byte sequences
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: 10xxxxxx trailing bytes, marked 0xFF
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF: two-byte lead bytes
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF: three-byte lead bytes
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xF7: four bytes; 0xF8 .. 0xFB: five; 0xFC .. 0xFD: six;
    // 0xFE and 0xFF: marked 0xFF
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
78 
/**
 * stride() returns the length of a UTF-8 sequence starting at index i
 * in string s.
 *
 * The length is looked up from the code unit s[i] alone; the bytes that
 * follow it are not inspected.
 *
 * Returns:
 *      The number of bytes in the UTF-8 sequence or
 *      0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
@safe @nogc pure nothrow
uint stride(const scope char[] s, size_t i)
{
    immutable char lead = s[i];
    return UTF8stride[lead];
}
91 
/**
 * stride() returns the length of a UTF-16 sequence starting at index i
 * in string s.
 *
 * Returns: 2 when s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 */
@safe @nogc pure nothrow
uint stride(const scope wchar[] s, size_t i)
{
    immutable wchar unit = s[i];
    immutable bool isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
    return isHighSurrogate ? 2 : 1;
}
101 
/**
 * stride() returns the length of a UTF-32 sequence starting at index i
 * in string s.
 *
 * Neither s nor i is examined.
 *
 * Returns: The return value will always be 1.
 */
@safe @nogc pure nothrow
uint stride(const scope dchar[] s, size_t i)
{
    enum uint unitsPerCodePoint = 1;
    return unitsPerCodePoint;
}
112 
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * Errors are reported through onUnicodeError when a code unit that cannot
 * start a sequence is found at a character boundary, or when i turns out
 * to lie inside a multi-unit sequence.
 */
@safe pure
size_t toUCSindex(const scope char[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        immutable uint len = stride(s, j);
        // 0xFF is the "not a sequence start" marker, not a length. Adding
        // it to j would skip 255 bytes and could let malformed input pass.
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", j);
        j += len;
        n++;
    }
    if (j > i)
    {
        // The last sequence overshot i, so i is not on a character boundary.
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}
135 
/** ditto */
@safe pure
size_t toUCSindex(const scope wchar[] s, size_t i)
{
    size_t count = 0;   // characters seen so far
    size_t pos = 0;     // code-unit offset of the next character

    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Stepping past i means i pointed into the middle of a character.
    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);

    return count;
}
154 
/** ditto */
@safe @nogc pure nothrow
size_t toUCSindex(const scope dchar[] s, size_t i)
{
    // The index is handed back unchanged; s is not inspected.
    immutable size_t characterIndex = i;
    return characterIndex;
}
161 
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 *
 * The array is walked one character at a time from the start. A code unit
 * that cannot begin a sequence is reported through onUnicodeError.
 */
@safe pure
size_t toUTFindex(const scope char[] s, size_t n)
{
    size_t offset = 0;

    for (size_t remaining = n; remaining != 0; --remaining)
    {
        immutable uint width = UTF8stride[s[offset]];
        if (width == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", offset);
        offset += width;
    }
    return offset;
}
179 
/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(const scope wchar[] s, size_t n)
{
    size_t offset = 0;

    for (size_t remaining = n; remaining != 0; --remaining)
    {
        immutable wchar unit = s[offset];
        // A high surrogate occupies two code units, everything else one.
        if (unit >= 0xD800 && unit <= 0xDBFF)
            offset += 2;
        else
            offset += 1;
    }
    return offset;
}
193 
/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(const scope dchar[] s, size_t n)
{
    // The index is handed back unchanged; s is not inspected.
    immutable size_t unitIndex = n;
    return unitIndex;
}
200 
201 /* =================== Decode ======================= */
202 
/***************
 * Decodes and returns character starting at s[idx]. idx is advanced past the
 * decoded character. If the character is not well formed, the error is
 * reported through onUnicodeError (expected to throw) and idx remains
 * unchanged.
 *
 * Only the one- to four-byte encodings are accepted:
 *  0xxxxxxx
 *  110xxxxx 10xxxxxx
 *  1110xxxx 10xxxxxx 10xxxxxx
 *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * The five- and six-byte forms, overlong forms, truncated sequences and
 * sequences that decode to an invalid code point are all rejected.
 */
@safe pure
dchar decode(const scope char[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    do
    {
        immutable size_t start = idx;
        immutable size_t total = s.length;
        immutable char lead = s[start];
        dchar V;
        uint n;         // number of code units in the sequence

        // ASCII needs no further work.
        if ((lead & 0x80) == 0)
        {
            idx = start + 1;
            return cast(dchar) lead;
        }

        // Classify the lead byte by its run of leading one bits. A lone
        // trailing byte (10xxxxxx) and the 5/6 byte forms are errors.
        if ((lead & 0xE0) == 0xC0)
            n = 2;
        else if ((lead & 0xF0) == 0xE0)
            n = 3;
        else if ((lead & 0xF8) == 0xF0)
            n = 4;
        else
            goto Lerr;

        // Pick off the (7 - n) payload bits of the lead byte.
        V = cast(dchar)(lead & ((1 << (7 - n)) - 1));

        if (start + (n - 1) >= total)
            goto Lerr;                      // off end of string

        /* The following combinations are overlong, and illegal:
         *  1100000x (10xxxxxx)
         *  11100000 100xxxxx (10xxxxxx)
         *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
         * (The overlong 5 and 6 byte forms were already rejected when the
         * lead byte was classified.)
         */
        if ((lead & 0xFE) == 0xC0)
            goto Lerr;
        {
            immutable char second = s[start + 1];
            if ((lead == 0xE0 && (second & 0xE0) == 0x80) ||
                (lead == 0xF0 && (second & 0xF0) == 0x80))
                goto Lerr;
        }

        // Fold in six payload bits from each trailing byte.
        foreach (k; 1 .. n)
        {
            immutable char trail = s[start + k];
            if ((trail & 0xC0) != 0x80)
                goto Lerr;                  // trailing bytes are 10xxxxxx
            V = (V << 6) | (trail & 0x3F);
        }
        if (!isValidDchar(V))
            goto Lerr;

        idx = start + n;
        return V;

      Lerr:
        onUnicodeError("invalid UTF-8 sequence", start);
        return V; // dummy return
    }
295 
unittest
{   size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // Plain ASCII: one code unit per character.
    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    // Two-byte sequence.
    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    // Three-byte sequence.
    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Malformed input: every entry must be rejected.
    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (bad; s4)
    {
        bool rejected = false;
        i = 0;
        try
        {
            c = decode(bad, i);
        }
        catch (Exception e)
        {
            // Catch Exception only: catching Throwable would also swallow
            // an AssertError and make this check impossible to fail.
            rejected = true;
        }
        assert(rejected);
        assert(i == 0);         // idx must be left untouched on failure
    }
}
347 
/** ditto */
@safe pure
dchar decode(const scope wchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    do
    {
        immutable size_t start = idx;
        uint u = s[start];
        string msg;

        // Code units below 0x80 stand for themselves.
        if (u < 0x80)
        {
            idx = start + 1;
            return cast(dchar) u;
        }

        if (u >= 0xD800 && u <= 0xDBFF)
        {
            // High surrogate: a low surrogate must follow.
            if (start + 1 == s.length)
            {
                msg = "surrogate UTF-16 high value past end of string";
                goto Lerr;
            }
            immutable uint low = s[start + 1];
            if (low < 0xDC00 || low > 0xDFFF)
            {
                msg = "surrogate UTF-16 low value out of range";
                goto Lerr;
            }
            // 0xD7C0 == 0xD800 - (0x10000 >> 10): this removes the
            // surrogate bias and adds the 0x10000 offset in one step.
            u = ((u - 0xD7C0) << 10) + (low - 0xDC00);
            idx = start + 2;
            return cast(dchar) u;
        }

        if (u >= 0xDC00 && u <= 0xDFFF)
        {
            msg = "unpaired surrogate UTF-16 value";
            goto Lerr;
        }

        if (u == 0xFFFE || u == 0xFFFF)
        {
            msg = "illegal UTF-16 value";
            goto Lerr;
        }

        // Any other single code unit.
        idx = start + 1;
        return cast(dchar) u;

      Lerr:
        onUnicodeError(msg, start);
        return cast(dchar) u; // dummy return
    }
405 
/** ditto */
@safe pure
dchar decode(const scope dchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    do
    {
        immutable size_t pos = idx;
        immutable dchar c = s[pos];

        if (isValidDchar(c))
        {
            idx = pos + 1;
            return c;
        }

        // idx is intentionally not advanced on the error path.
        onUnicodeError("invalid UTF-32 value", pos);
        return c; // dummy return
    }
427 
428 
429 /* =================== Encode ======================= */
430 
/*******************************
 * Encodes character c and appends it to array s[].
 *
 * c is required (by the in contract) to satisfy isValidDchar.
 */
@safe pure nothrow
void encode(ref char[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        // ASCII is appended directly.
        if (c <= 0x7F)
        {
            s ~= cast(char) c;
            return;
        }

        // Multi-byte forms are assembled in a scratch buffer and
        // appended in a single operation.
        char[4] units = void;
        size_t count;

        if (c <= 0x7FF)
        {
            units[0] = cast(char)(0xC0 | (c >> 6));
            units[1] = cast(char)(0x80 | (c & 0x3F));
            count = 2;
        }
        else if (c <= 0xFFFF)
        {
            units[0] = cast(char)(0xE0 | (c >> 12));
            units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else if (c <= 0x10FFFF)
        {
            units[0] = cast(char)(0xF0 | (c >> 18));
            units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
        else
        {
            assert(0);
        }

        s ~= units[0 .. count];
    }
482 
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] buf = "abcd".dup;

    // One-byte (ASCII) encoding.
    encode(buf, cast(dchar)'a');
    assert(buf.length == 5);
    assert(buf == "abcda");

    // Two-byte encoding.
    encode(buf, cast(dchar)'\u00A9');
    assert(buf.length == 7);
    assert(buf == "abcda\xC2\xA9");
    //assert(buf == "abcda\u00A9");     // BUG: fix compiler

    // Three-byte encoding.
    encode(buf, cast(dchar)'\u2260');
    assert(buf.length == 10);
    assert(buf == "abcda\xC2\xA9\xE2\x89\xA0");
}
501 
/** ditto */
@safe pure nothrow
void encode(ref wchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        // Values up to 0xFFFF fit in a single code unit.
        if (c <= 0xFFFF)
        {
            s ~= cast(wchar) c;
            return;
        }

        // Everything else becomes a surrogate pair: ten bits in the
        // high surrogate, ten bits in the low one.
        immutable uint offset = c - 0x10000;
        wchar[2] pair = void;

        pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        s ~= pair;
    }
527 
/** ditto */
@safe pure nothrow
void encode(ref dchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        // A code point is its own UTF-32 encoding; just append it.
        s ~= c;
    }
539 
/**
Returns the code length of $(D c) in the encoding using $(D C) as a
code point. The code is returned in character count, not in bytes.

The encoding is selected by $(D C.sizeof): 1 for UTF-8, 2 for UTF-16 and
4 for UTF-32. Any other size is rejected at compile time.
 */
@safe pure nothrow @nogc
ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        // Values beyond 0x10FFFF have no UTF-8 length.
        if (c > 0x10FFFF)
            assert(false);
        if (c > 0xFFFF)
            return 4;
        if (c > 0x7FF)
            return 3;
        if (c > 0x7F)
            return 2;
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        // Two code units (a surrogate pair) above 0xFFFF, one otherwise.
        if (c > 0xFFFF)
            return 2;
        return 1;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}
565 
566 /* =================== Validation ======================= */
567 
/***********************************
Checks to see if string is well formed or not. $(D S) can be an array
 of $(D char), $(D wchar), or $(D dchar). Malformed input is reported by
 the underlying decode() call. Use to check all untrusted input for
 correctness.
 */
@safe pure
void validate(S)(const scope S s)
{
    immutable size_t end = s.length;
    size_t pos = 0;

    // decode() advances pos past each character it accepts; the decoded
    // values themselves are not needed.
    while (pos < end)
        decode(s, pos);
}
582 
583 /* =================== Conversion to UTF8 ======================= */
584 
/*******************
 * Encodes character c as UTF-8 into the start of buf.
 *
 * buf must be large enough for the encoding (up to 4 code units).
 *
 * Returns: the slice of buf that holds the encoded character.
 */
@safe pure nothrow @nogc
char[] toUTF8(return scope char[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        // ASCII is stored as is.
        if (c <= 0x7F)
        {
            buf[0] = cast(char) c;
            return buf[0 .. 1];
        }

        // Choose the lead-byte marker and the sequence length.
        uint lead;
        uint n;
        if (c <= 0x7FF)
        {
            lead = 0xC0;
            n = 2;
        }
        else if (c <= 0xFFFF)
        {
            lead = 0xE0;
            n = 3;
        }
        else if (c <= 0x10FFFF)
        {
            lead = 0xF0;
            n = 4;
        }
        else
            assert(0);

        // The lead byte carries the top bits; each trailing byte carries
        // the next six bits, most significant group first.
        uint shift = 6 * (n - 1);
        buf[0] = cast(char)(lead | (c >> shift));
        foreach (k; 1 .. n)
        {
            shift -= 6;
            buf[k] = cast(char)(0x80 | ((c >> shift) & 0x3F));
        }
        return buf[0 .. n];
    }
621 
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * When s is already a UTF-8 string it is returned as is (its in contract
 * runs validate on it); UTF-16 and UTF-32 input is re-encoded into a
 * newly built string.
 */
@safe pure nothrow
string toUTF8(return scope string s)
    in
    {
        validate(s);
    }
    do
    {
        // Already UTF-8: hand the same string back.
        return s;
    }
635 
/** ditto */
@trusted pure
string toUTF8(const scope wchar[] s)
{
    immutable size_t total = s.length;
    char[] r;

    r.length = total;

    // Fast path: copy the leading run of ASCII code units one for one.
    size_t ascii = 0;
    while (ascii < total && s[ascii] <= 0x7F)
    {
        r[ascii] = cast(char) s[ascii];
        ++ascii;
    }

    // First non-ASCII unit found: trim the result to what has been
    // copied and encode the remainder character by character.
    if (ascii < total)
    {
        r.length = ascii;
        foreach (dchar ch; s[ascii .. total])
        {
            encode(r, ch);
        }
    }
    return cast(string)r;
}
663 
/** ditto */
@trusted pure
string toUTF8(const scope dchar[] s)
{
    immutable size_t total = s.length;
    char[] r;

    r.length = total;

    // Fast path: copy the leading run of ASCII code points one for one.
    size_t ascii = 0;
    while (ascii < total && s[ascii] <= 0x7F)
    {
        r[ascii] = cast(char) s[ascii];
        ++ascii;
    }

    // First non-ASCII code point found: trim the result to what has
    // been copied and encode the remainder one code point at a time.
    if (ascii < total)
    {
        r.length = ascii;
        foreach (dchar d; s[ascii .. total])
        {
            encode(r, d);
        }
    }
    return cast(string)r;
}
691 
692 /* =================== Conversion to UTF16 ======================= */
693 
/****************
 * Encodes character c as UTF-16 into the start of buf.
 *
 * buf must be large enough for the encoding (up to 2 code units).
 *
 * Returns: the slice of buf that holds the encoded character.
 */
@safe pure nothrow @nogc
wchar[] toUTF16(return scope wchar[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        if (c > 0xFFFF)
        {
            // Split into a surrogate pair, ten bits per code unit.
            immutable uint offset = c - 0x10000;
            buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
            return buf[0 .. 2];
        }

        // Fits in a single code unit.
        buf[0] = cast(wchar) c;
        return buf[0 .. 1];
    }
714 
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 */
@trusted pure
wstring toUTF16(const scope char[] s)
{
    immutable size_t slen = s.length;
    wchar[] r;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (slen == 0)
            return ""w;
        r.reserve(slen);
    }

    size_t i = 0;
    while (i < slen)
    {
        immutable char unit = s[i];
        if (unit <= 0x7F)
        {
            // ASCII maps straight onto one UTF-16 code unit.
            r ~= cast(wchar) unit;
            ++i;
        }
        else
        {
            // decode() advances i past the whole sequence.
            encode(r, decode(s, i));
        }
    }
    return cast(wstring)r;
}
750 
alias wptr = const(wchar)*;
/** ditto */
@safe pure
wptr toUTF16z(const scope char[] s)
{
    immutable size_t slen = s.length;
    wchar[] r;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (slen == 0)
            return &"\0"w[0];
        r.reserve(slen + 1);    // one extra unit for the terminator
    }

    size_t i = 0;
    while (i < slen)
    {
        immutable char unit = s[i];
        if (unit <= 0x7F)
        {
            // ASCII maps straight onto one UTF-16 code unit.
            r ~= cast(wchar) unit;
            ++i;
        }
        else
        {
            // decode() advances i past the whole sequence.
            encode(r, decode(s, i));
        }
    }

    // Terminate with a zero code unit and return a pointer to the start.
    r ~= '\000';
    return &r[0];
}
784 
/** ditto */
@safe pure nothrow
wstring toUTF16(return scope wstring s)
    in
    {
        validate(s);
    }
    do
    {
        // Already UTF-16: hand the same string back.
        return s;
    }
796 
/** ditto */
@trusted pure nothrow
wstring toUTF16(const scope dchar[] s)
{
    immutable size_t slen = s.length;
    wchar[] r;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (slen == 0)
            return ""w;
        r.reserve(slen);
    }

    // Each code point is appended as one or two UTF-16 code units.
    foreach (c; s)
    {
        encode(r, c);
    }
    return cast(wstring)r;
}
818 
819 /* =================== Conversion to UTF32 ======================= */
820 
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 */
@trusted pure
dstring toUTF32(const scope char[] s)
{
    immutable size_t slen = s.length;
    dchar[] r;
    size_t written = 0;

    r.length = slen;            // r[] will never be longer than s[]

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];
        if (c < 0x80)
            ++i;                // c is ascii, no need for decode
        else
            c = decode(s, i);   // advances i past the whole sequence
        r[written] = c;
        ++written;
    }
    return cast(dstring)r[0 .. written];
}
843 
/** ditto */
@trusted pure
dstring toUTF32(const scope wchar[] s)
{
    immutable size_t slen = s.length;
    dchar[] r;
    size_t written = 0;

    r.length = slen;            // r[] will never be longer than s[]

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];
        if (c < 0x80)
            ++i;                // c is ascii, no need for decode
        else
            c = decode(s, i);   // advances i past the whole sequence
        r[written] = c;
        ++written;
    }
    return cast(dstring)r[0 .. written];
}
864 
/** ditto */
@safe pure nothrow
dstring toUTF32(return scope dstring s)
    in
    {
        validate(s);
    }
    do
    {
        // Already UTF-32: hand the same string back.
        return s;
    }
876 
877 /* ================================ tests ================================== */
878 
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Converts the same text between all three encodings and checks each
    // result against the expected literal in that encoding.
    static void checkConversions(string c8, wstring c16, dstring c32)
    {
        // From UTF-8.
        auto w = toUTF16(c8);
        assert(w == c16);
        auto d = toUTF32(c8);
        assert(d == c32);

        // From UTF-16.
        auto c = toUTF8(w);
        assert(c == c8);
        d = toUTF32(w);
        assert(d == c32);

        // From UTF-32.
        c = toUTF8(d);
        assert(c == c8);
        w = toUTF16(d);
        assert(w == c16);
    }

    // ASCII only.
    checkConversions("hello", "hello", "hello");
    // A character that needs three UTF-8 code units.
    checkConversions("hel\u1234o", "hel\u1234o", "hel\u1234o");
    // A character that needs four UTF-8 units and a UTF-16 surrogate pair.
    checkConversions("he\U000BAAAAllo", "he\U000BAAAAllo", "he\U000BAAAAllo");

    // Single character encoded into a caller-supplied buffer.
    wchar[2] buf;
    auto ret = toUTF16(buf, '\U000BAAAA');
    assert(ret == "\U000BAAAA");
}
939