xref: /freebsd-src/contrib/llvm-project/llvm/include/llvm/Support/ConvertUTF.h (revision 8cc087a1eee9ec1ca9f7ac1e63ad51bdb5a682eb)
1 /*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *==------------------------------------------------------------------------==*/
8 /*
9  * Copyright 2001-2004 Unicode, Inc.
10  *
11  * Disclaimer
12  *
13  * This source code is provided as is by Unicode, Inc. No claims are
14  * made as to fitness for any particular purpose. No warranties of any
15  * kind are expressed or implied. The recipient agrees to determine
16  * applicability of information provided. If this file has been
17  * purchased on magnetic or optical media from Unicode, Inc., the
18  * sole remedy for any claim will be exchange of defective media
19  * within 90 days of receipt.
20  *
21  * Limitations on Rights to Redistribute This Code
22  *
23  * Unicode, Inc. hereby grants the right to freely use the information
24  * supplied in this file in the creation of products supporting the
25  * Unicode Standard, and to make copies of this file in any form
26  * for internal or external distribution as long as this notice
27  * remains attached.
28  */
29 
30 /* ---------------------------------------------------------------------
31 
32     Conversions between UTF-32, UTF-16, and UTF-8.  Header file.
33 
34     Several functions are included here, forming a complete set of
35     conversions between the three formats.  UTF-7 is not included
36     here, but is handled in a separate source file.
37 
38     Each of these routines takes pointers to input buffers and output
39     buffers.  The input buffers are const.
40 
41     Each routine converts the text between *sourceStart and sourceEnd,
42     putting the result into the buffer between *targetStart and
43     targetEnd. Note: the end pointers are *after* the last item: e.g.
44     *(sourceEnd - 1) is the last item.
45 
46     The return result indicates whether the conversion was successful,
47     and if not, whether the problem was in the source or target buffers.
48     (Only the first encountered problem is indicated.)
49 
50     After the conversion, *sourceStart and *targetStart are both
51     updated to point to the end of the last text successfully converted in
52     the respective buffers.
53 
54     Input parameters:
55         sourceStart - pointer to a pointer to the source buffer.
56                 The contents of this are modified on return so that
57                 it points at the next thing to be converted.
58         targetStart - similarly, pointer to pointer to the target buffer.
59         sourceEnd, targetEnd - respectively pointers to the ends of the
60                 two buffers, for overflow checking only.
61 
62     These conversion functions take a ConversionFlags argument. When this
63     flag is set to strict, both irregular sequences and isolated surrogates
64     will cause an error.  When the flag is set to lenient, both irregular
65     sequences and isolated surrogates are converted.
66 
67     Whether the flag is strict or lenient, all illegal sequences will cause
68     an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
69     or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
70     must check for illegal sequences.
71 
72     When the flag is set to lenient, characters over 0x10FFFF are converted
73     to the replacement character; otherwise (when the flag is set to strict)
74     they constitute an error.
75 
76     Output parameters:
77         The value "sourceIllegal" is returned from some routines if the input
78         sequence is malformed.  When "sourceIllegal" is returned, the source
79         value will point to the illegal value that caused the problem. E.g.,
80         in UTF-8 when a sequence is malformed, it points to the start of the
81         malformed sequence.
82 
83     Author: Mark E. Davis, 1994.
84     Rev History: Rick McGowan, fixes & updates May 2001.
85          Fixes & updates, Sept 2001.
86 
87 ------------------------------------------------------------------------ */
88 
89 #ifndef LLVM_SUPPORT_CONVERTUTF_H
90 #define LLVM_SUPPORT_CONVERTUTF_H
91 
92 #include <cstddef>
93 #include <string>
94 
95 #if defined(_WIN32)
96 #include <system_error>
97 #endif
98 
99 // Wrap everything in namespace llvm so that programs can link with llvm and
100 // their own version of the unicode libraries.
101 
102 namespace llvm {
103 
104 /* ---------------------------------------------------------------------
105     The following 4 definitions are compiler-specific.
106     The C standard does not guarantee that wchar_t has at least
107     16 bits, so wchar_t is no less portable than unsigned short!
108     All should be unsigned values to avoid sign extension during
109     bit mask & shift operations.
110 ------------------------------------------------------------------------ */
111 
/* Code-unit types for the three encoding forms, plus a small boolean type.
   All of them are unsigned so that masking and shifting never sign-extend. */
using UTF32 = unsigned int;     /* at least 32 bits */
using UTF16 = unsigned short;   /* at least 16 bits */
using UTF8 = unsigned char;     /* typically 8 bits */
using Boolean = unsigned char;  /* 0 or 1 */
116 
/* Some fundamental constants.
 *
 * Each UTF32-typed replacement list is a fully parenthesized cast, so the
 * macro expands to a single primary expression and can be the operand of any
 * operator (for example `sizeof UNI_MAX_BMP`).  The expansions name UTF32, so
 * that type must be visible wherever one of these macros is used.
 */
#define UNI_REPLACEMENT_CHAR (static_cast<UTF32>(0x0000FFFD))
#define UNI_MAX_BMP (static_cast<UTF32>(0x0000FFFF))
#define UNI_MAX_UTF16 (static_cast<UTF32>(0x0010FFFF))
#define UNI_MAX_UTF32 (static_cast<UTF32>(0x7FFFFFFF))
#define UNI_MAX_LEGAL_UTF32 (static_cast<UTF32>(0x0010FFFF))

/* Size, in bytes, that an output buffer needs in order to hold the UTF8
   encoding of any single code point. */
#define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4

/* The UTF16 byte order mark as it reads in native byte order, and as it
   reads when its two bytes are swapped. */
#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE  0xFEFF
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
128 
/* Status reported by the conversion routines.  Only the first problem that is
   encountered is reported. */
enum ConversionResult {
  conversionOK,    /* conversion successful */
  sourceExhausted, /* partial character in source, but hit end */
  targetExhausted, /* insufficient room in target for conversion */
  sourceIllegal    /* source sequence is illegal/malformed */
};

/* Selects strict or lenient handling in the conversion routines; see the
   comment at the top of this file for what each mode accepts. */
enum ConversionFlags {
  strictConversion = 0,
  lenientConversion
};
140 
141 ConversionResult ConvertUTF8toUTF16 (
142   const UTF8** sourceStart, const UTF8* sourceEnd,
143   UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
144 
145 /**
146  * Convert a partial UTF8 sequence to UTF32.  If the sequence ends in an
147  * incomplete code unit sequence, returns \c sourceExhausted.
148  */
149 ConversionResult ConvertUTF8toUTF32Partial(
150   const UTF8** sourceStart, const UTF8* sourceEnd,
151   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
152 
153 /**
154  * Convert a partial UTF8 sequence to UTF32.  If the sequence ends in an
155  * incomplete code unit sequence, returns \c sourceIllegal.
156  */
157 ConversionResult ConvertUTF8toUTF32(
158   const UTF8** sourceStart, const UTF8* sourceEnd,
159   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
160 
161 ConversionResult ConvertUTF16toUTF8 (
162   const UTF16** sourceStart, const UTF16* sourceEnd,
163   UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
164 
165 ConversionResult ConvertUTF32toUTF8 (
166   const UTF32** sourceStart, const UTF32* sourceEnd,
167   UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
168 
169 ConversionResult ConvertUTF16toUTF32 (
170   const UTF16** sourceStart, const UTF16* sourceEnd,
171   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
172 
173 ConversionResult ConvertUTF32toUTF16 (
174   const UTF32** sourceStart, const UTF32* sourceEnd,
175   UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
176 
177 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
178 
179 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
180 
181 unsigned getNumBytesForUTF8(UTF8 firstByte);
182 
183 /*************************************************************************/
184 /* Below are LLVM-specific wrappers of the functions above. */
185 
186 template <typename T> class ArrayRef;
187 template <typename T> class SmallVectorImpl;
188 class StringRef;
189 
190 /**
191  * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
192  * WideCharWidth. The converted data is written to ResultPtr, which needs to
193  * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success,
194  * ResultPtr will point one after the end of the copied string. On failure,
195  * ResultPtr will not be changed, and ErrorPtr will be set to the location of
196  * the first character which could not be converted.
197  * \return true on success.
198  */
199 bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
200                        char *&ResultPtr, const UTF8 *&ErrorPtr);
201 
202 /**
203 * Converts a UTF-8 StringRef to a std::wstring.
204 * \return true on success.
205 */
206 bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result);
207 
208 /**
209 * Converts a UTF-8 C-string to a std::wstring.
210 * \return true on success.
211 */
212 bool ConvertUTF8toWide(const char *Source, std::wstring &Result);
213 
214 /**
215 * Converts a std::wstring to a UTF-8 encoded std::string.
216 * \return true on success.
217 */
218 bool convertWideToUTF8(const std::wstring &Source, std::string &Result);
219 
220 
221 /**
222  * Convert an Unicode code point to UTF8 sequence.
223  *
224  * \param Source a Unicode code point.
225  * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
226  * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes.  On success \c ResultPtr is
227  * updated one past end of the converted sequence.
228  *
229  * \returns true on success.
230  */
231 bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);
232 
233 /**
234  * Convert the first UTF8 sequence in the given source buffer to a UTF32
235  * code point.
236  *
237  * \param [in,out] source A pointer to the source buffer. If the conversion
238  * succeeds, this pointer will be updated to point to the byte just past the
239  * end of the converted sequence.
240  * \param sourceEnd A pointer just past the end of the source buffer.
241  * \param [out] target The converted code
242  * \param flags Whether the conversion is strict or lenient.
243  *
244  * \returns conversionOK on success
245  *
246  * \sa ConvertUTF8toUTF32
247  */
248 inline ConversionResult convertUTF8Sequence(const UTF8 **source,
249                                             const UTF8 *sourceEnd,
250                                             UTF32 *target,
251                                             ConversionFlags flags) {
252   if (*source == sourceEnd)
253     return sourceExhausted;
254   unsigned size = getNumBytesForUTF8(**source);
255   if ((ptrdiff_t)size > sourceEnd - *source)
256     return sourceExhausted;
257   return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
258 }
259 
260 /**
261  * Returns true if a blob of text starts with a UTF-16 big or little endian byte
262  * order mark.
263  */
264 bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
265 
266 /**
267  * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
268  *
269  * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
270  * \param [out] Out Converted UTF-8 is stored here on success.
271  * \returns true on success
272  */
273 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
274 
275 /**
276 * Converts a UTF16 string into a UTF8 std::string.
277 *
278 * \param [in] Src A buffer of UTF-16 encoded text.
279 * \param [out] Out Converted UTF-8 is stored here on success.
280 * \returns true on success
281 */
282 bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
283 
284 /**
285  * Converts a UTF-8 string into a UTF-16 string with native endianness.
286  *
287  * \returns true on success
288  */
289 bool convertUTF8ToUTF16String(StringRef SrcUTF8,
290                               SmallVectorImpl<UTF16> &DstUTF16);
291 
#ifdef _WIN32
namespace sys {
namespace windows {

/// Convert from UTF8 to UTF16, storing the result in \p utf16.
std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);

/// Convert to UTF16 from the current code page used in the system.
/// NOTE(review): the input parameter is named \p utf8 although the text is
/// described as being in the current code page.
std::error_code CurCPToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);

/// Convert \p utf16_len units of UTF16 starting at \p utf16 to UTF8, storing
/// the result in \p utf8.
std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
                            SmallVectorImpl<char> &utf8);

/// Convert from UTF16 to the current code page used in the system.
/// NOTE(review): the output parameter is named \p utf8 although the result is
/// described as being in the current code page.
std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
                             SmallVectorImpl<char> &utf8);

} // namespace windows
} // namespace sys
#endif
306 
307 } /* end namespace llvm */
308 
309 #endif
310