xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/d/dmd/utf.c (revision 627f7eb200a4419d89b531d55fccd2ee3ffdcde0)
1 
2 /* Compiler implementation of the D programming language
3  * Copyright (C) 2003-2019 by The D Language Foundation, All Rights Reserved
4  * written by Walter Bright
5  * http://www.digitalmars.com
6  * Distributed under the Boost Software License, Version 1.0.
7  * http://www.boost.org/LICENSE_1_0.txt
8  * https://github.com/D-Programming-Language/dmd/blob/master/src/utf.c
9  */
10 
11 /// Description of UTF-8 in [1].  Unicode non-characters and private-use
12 /// code points described in [2],[4].
13 ///
14 /// References:
15 /// [1] http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
16 /// [2] http://en.wikipedia.org/wiki/Unicode
17 /// [3] http://unicode.org/faq/utf_bom.html
18 /// [4] http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
19 
20 #include "utf.h"
21 
/* Expected UTF-8 sequence length, indexed by the first code unit.
 *
 * Bit patterns of the lead byte and the sequence each one introduces:
 *      0xxxxxxx                                               (1 unit)
 *      110xxxxx 10xxxxxx                                      (2 units)
 *      1110xxxx 10xxxxxx 10xxxxxx                             (3 units)
 *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx                    (4 units)
 *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx           (5 units)
 *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx  (6 units)
 *
 * The 5 and 6 unit forms are listed in the table but are not valid
 * encodings.  Bytes of the form 10xxxxxx (trailing units) and the bytes
 * 0xFE/0xFF are given the value 0xFF.
 */
const unsigned UTF8_STRIDE[256] =
{
    /* 0x00 - 0x7F: one code unit */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    /* 0x00 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    /* 0x10 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    /* 0x20 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    /* 0x30 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    /* 0x40 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    /* 0x50 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    /* 0x60 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    /* 0x70 */

    /* 0x80 - 0xBF: trailing code units, never the start of a sequence */
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,

    /* 0xC0 - 0xDF: two code units */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,

    /* 0xE0 - 0xEF: three code units */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six, 0xFE - 0xFF: none */
    4,4,4,4,4,4,4,4, 5,5,5,5, 6,6, 0xFF,0xFF,
};
50 
// Messages returned by utf_decodeChar when a UTF-8 sequence is rejected.

// Lead byte does not start a 1-4 unit sequence.
const char UTF8_DECODE_OUTSIDE_CODE_SPACE[] = "Outside Unicode code space";
// Fewer code units remain in the input than the lead byte requires.
const char UTF8_DECODE_TRUNCATED_SEQUENCE[] = "Truncated UTF-8 sequence";
// The value was encoded with more code units than necessary.
const char UTF8_DECODE_OVERLONG[] = "Overlong UTF-8 sequence";
// A code unit after the lead byte is not of the form 10xxxxxx.
const char UTF8_DECODE_INVALID_TRAILER[] = "Invalid trailing code unit";
// The decoded value was refused by utf_isValidDchar.
const char UTF8_DECODE_INVALID_CODE_POINT[] = "Invalid code point decoded";
57 
// Messages returned by utf_decodeWchar when a UTF-16 sequence is rejected.

// A high surrogate is the last code unit of the input.
const char UTF16_DECODE_TRUNCATED_SEQUENCE[] = "Truncated UTF-16 sequence";
// The code unit following a high surrogate is not a low surrogate.
const char UTF16_DECODE_INVALID_SURROGATE[] = "Invalid low surrogate";
// A low surrogate appears without a preceding high surrogate.
const char UTF16_DECODE_UNPAIRED_SURROGATE[] = "Unpaired surrogate";
// The decoded value was refused by utf_isValidDchar.
const char UTF16_DECODE_INVALID_CODE_POINT[] = "Invalid code point decoded";
63 
64 /// The Unicode code space is the range of code points [0x000000,0x10FFFF]
65 /// except the UTF-16 surrogate pairs in the range [0xD800,0xDFFF]
66 /// and non-characters (which end in 0xFFFE or 0xFFFF).
utf_isValidDchar(dchar_t c)67 bool utf_isValidDchar(dchar_t c)
68 {
69     // TODO: Whether non-char code points should be rejected is pending review
70     // largest character code point
71     if (c > 0x10FFFF)
72         return false;
73     // surrogate pairs
74     if (0xD800 <= c && c <= 0xDFFF)
75         return false;
76     // non-characters
77     if ((c & 0xFFFFFE) == 0x00FFFE)
78         return false;
79     return true;
80 }
81 
82 /*******************************
83  * Return !=0 if unicode alpha.
84  * Use table from C99 Appendix D.
85  */
86 
isUniAlpha(dchar_t c)87 bool isUniAlpha(dchar_t c)
88 {
89     size_t high = ALPHA_TABLE_LENGTH - 1;
90     // Shortcut search if c is out of range
91     size_t low
92         = (c < ALPHA_TABLE[0][0] || ALPHA_TABLE[high][1] < c) ? high + 1 : 0;
93     // Binary search
94     while (low <= high)
95     {
96         size_t mid = (low + high) >> 1;
97         if (c < ALPHA_TABLE[mid][0])
98             high = mid - 1;
99         else if (ALPHA_TABLE[mid][1] < c)
100             low = mid + 1;
101         else
102         {
103             assert(ALPHA_TABLE[mid][0] <= c && c <= ALPHA_TABLE[mid][1]);
104             return true;
105         }
106     }
107     return false;
108 }
109 
110 /**
111  * Returns the code length of c in code units.
112  */
113 
utf_codeLengthChar(dchar_t c)114 int utf_codeLengthChar(dchar_t c)
115 {
116   if (c <= 0x7F)
117       return 1;
118   if (c <= 0x7FF)
119       return 2;
120   if (c <= 0xFFFF)
121       return 3;
122   if (c <= 0x10FFFF)
123       return 4;
124   assert(false);
125 }
126 
utf_codeLengthWchar(dchar_t c)127 int utf_codeLengthWchar(dchar_t c)
128 {
129     return c <= 0xFFFF ? 1 : 2;
130 }
131 
132 /**
133  * Returns the code length of c in code units for the encoding.
134  * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32.
135  */
136 
utf_codeLength(int sz,dchar_t c)137 int utf_codeLength(int sz, dchar_t c)
138 {
139     if (sz == 1)
140         return utf_codeLengthChar(c);
141     if (sz == 2)
142         return utf_codeLengthWchar(c);
143     assert(sz == 4);
144     return 1;
145 }
146 
utf_encodeChar(utf8_t * s,dchar_t c)147 void utf_encodeChar(utf8_t *s, dchar_t c)
148 {
149     assert(s != NULL);
150     assert(utf_isValidDchar(c));
151     if (c <= 0x7F)
152     {
153         s[0] = static_cast<utf8_t>(c);
154     }
155     else if (c <= 0x07FF)
156     {
157         s[0] = static_cast<utf8_t>(0xC0 | (c >> 6));
158         s[1] = static_cast<utf8_t>(0x80 | (c & 0x3F));
159     }
160     else if (c <= 0xFFFF)
161     {
162         s[0] = static_cast<utf8_t>(0xE0 | (c >> 12));
163         s[1] = static_cast<utf8_t>(0x80 | ((c >> 6) & 0x3F));
164         s[2] = static_cast<utf8_t>(0x80 | (c & 0x3F));
165     }
166     else if (c <= 0x10FFFF)
167     {
168         s[0] = static_cast<utf8_t>(0xF0 | (c >> 18));
169         s[1] = static_cast<utf8_t>(0x80 | ((c >> 12) & 0x3F));
170         s[2] = static_cast<utf8_t>(0x80 | ((c >> 6) & 0x3F));
171         s[3] = static_cast<utf8_t>(0x80 | (c & 0x3F));
172     }
173     else
174         assert(0);
175 }
176 
utf_encodeWchar(utf16_t * s,dchar_t c)177 void utf_encodeWchar(utf16_t *s, dchar_t c)
178 {
179     assert(s != NULL);
180     assert(utf_isValidDchar(c));
181     if (c <= 0xFFFF)
182     {
183         s[0] = static_cast<utf16_t>(c);
184     }
185     else
186     {
187         s[0] = static_cast<utf16_t>((((c - 0x010000) >> 10) & 0x03FF) + 0xD800);
188         s[1] = static_cast<utf16_t>(((c - 0x010000) & 0x03FF) + 0xDC00);
189     }
190 }
191 
utf_encode(int sz,void * s,dchar_t c)192 void utf_encode(int sz, void *s, dchar_t c)
193 {
194     if (sz == 1)
195         utf_encodeChar((utf8_t *)s, c);
196     else if (sz == 2)
197         utf_encodeWchar((utf16_t *)s, c);
198     else
199     {
200         assert(sz == 4);
201         *((utf32_t *)s) = c;
202     }
203 }
204 
205 /********************************************
206  * Decode a UTF-8 sequence as a single UTF-32 code point.
207  * Returns:
208  *      NULL    success
209  *      !=NULL  error message string
210  */
211 
utf_decodeChar(utf8_t const * s,size_t len,size_t * pidx,dchar_t * presult)212 const char *utf_decodeChar(utf8_t const *s, size_t len, size_t *pidx, dchar_t *presult)
213 {
214     assert(s != NULL);
215     assert(pidx != NULL);
216     assert(presult != NULL);
217     size_t i = (*pidx)++;
218     assert(i < len);
219     utf8_t u = s[i];
220     // Pre-stage results for ASCII and error cases
221     *presult = u;
222 
223     //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);
224 
225     // Get expected sequence length
226     size_t n = UTF8_STRIDE[u];
227     switch (n)
228     {
229     case 1:                             // ASCII
230         return UTF8_DECODE_OK;
231     case 2: case 3: case 4:             // multi-byte UTF-8
232         break;
233     default:                            // 5- or 6-byte sequence
234         return UTF8_DECODE_OUTSIDE_CODE_SPACE;
235     }
236     if (len < i + n)                    // source too short
237         return UTF8_DECODE_TRUNCATED_SEQUENCE;
238 
239     // Pick off 7 - n low bits from first code unit
240     utf32_t c = u & ((1 << (7 - n)) - 1);
241     /* The following combinations are overlong, and illegal:
242      *      1100000x (10xxxxxx)
243      *      11100000 100xxxxx (10xxxxxx)
244      *      11110000 1000xxxx (10xxxxxx 10xxxxxx)
245      *      11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
246      *      11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
247      */
248     utf8_t u2 = s[++i];
249     // overlong combination
250     if ((u & 0xFE) == 0xC0 ||
251         (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
252         (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
253         (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
254         (u == 0xFC && (u2 & 0xFC) == 0x80))
255         return UTF8_DECODE_OVERLONG;
256     // Decode remaining bits
257     for (n += i - 1; i != n; ++i)
258     {
259         u = s[i];
260         if ((u & 0xC0) != 0x80)         // trailing bytes are 10xxxxxx
261             return UTF8_DECODE_INVALID_TRAILER;
262         c = (c << 6) | (u & 0x3F);
263     }
264     if (!utf_isValidDchar(c))
265         return UTF8_DECODE_INVALID_CODE_POINT;
266     *pidx = i;
267     *presult = c;
268     return UTF8_DECODE_OK;
269 }
270 
271 /********************************************
272  * Decode a UTF-16 sequence as a single UTF-32 code point.
273  * Returns:
274  *      NULL    success
275  *      !=NULL  error message string
276  */
277 
utf_decodeWchar(utf16_t const * s,size_t len,size_t * pidx,dchar_t * presult)278 const char *utf_decodeWchar(utf16_t const *s, size_t len, size_t *pidx, dchar_t *presult)
279 {
280     assert(s != NULL);
281     assert(pidx != NULL);
282     assert(presult != NULL);
283     size_t i = (*pidx)++;
284     assert(i < len);
285     // Pre-stage results for ASCII and error cases
286     utf32_t u = *presult = s[i];
287 
288     if (u < 0x80)                       // ASCII
289         return UTF16_DECODE_OK;
290     if (0xD800 <= u && u <= 0xDBFF)     // Surrogate pair
291     {   if (len <= i + 1)
292             return UTF16_DECODE_TRUNCATED_SEQUENCE;
293         utf16_t u2 = s[i + 1];
294         if (u2 < 0xDC00 || 0xDFFF < u)
295             return UTF16_DECODE_INVALID_SURROGATE;
296         u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
297         ++*pidx;
298     }
299     else if (0xDC00 <= u && u <= 0xDFFF)
300         return UTF16_DECODE_UNPAIRED_SURROGATE;
301     if (!utf_isValidDchar(u))
302         return UTF16_DECODE_INVALID_CODE_POINT;
303     *presult = u;
304     return UTF16_DECODE_OK;
305 }
306