1 //===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----=== 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "llvm/ADT/ArrayRef.h" 10 #include "llvm/ADT/StringRef.h" 11 #include "llvm/Support/ConvertUTF.h" 12 #include "llvm/Support/ErrorHandling.h" 13 #include <string> 14 #include <vector> 15 16 namespace llvm { 17 18 bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, 19 char *&ResultPtr, const UTF8 *&ErrorPtr) { 20 assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4); 21 ConversionResult result = conversionOK; 22 // Copy the character span over. 23 if (WideCharWidth == 1) { 24 const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin()); 25 if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) { 26 result = sourceIllegal; 27 ErrorPtr = Pos; 28 } else { 29 memcpy(ResultPtr, Source.data(), Source.size()); 30 ResultPtr += Source.size(); 31 } 32 } else if (WideCharWidth == 2) { 33 const UTF8 *sourceStart = (const UTF8*)Source.data(); 34 // FIXME: Make the type of the result buffer correct instead of 35 // using reinterpret_cast. 36 UTF16 *targetStart = reinterpret_cast<UTF16 *>(ResultPtr); 37 ConversionFlags flags = strictConversion; 38 result = 39 ConvertUTF8toUTF16(&sourceStart, sourceStart + Source.size(), 40 &targetStart, targetStart + Source.size(), flags); 41 if (result == conversionOK) 42 ResultPtr = reinterpret_cast<char *>(targetStart); 43 else 44 ErrorPtr = sourceStart; 45 } else if (WideCharWidth == 4) { 46 const UTF8 *sourceStart = (const UTF8 *)Source.data(); 47 // FIXME: Make the type of the result buffer correct instead of 48 // using reinterpret_cast. 49 UTF32 *targetStart = reinterpret_cast<UTF32 *>(ResultPtr); 50 ConversionFlags flags = strictConversion; 51 result = 52 ConvertUTF8toUTF32(&sourceStart, sourceStart + Source.size(), 53 &targetStart, targetStart + Source.size(), flags); 54 if (result == conversionOK) 55 ResultPtr = reinterpret_cast<char *>(targetStart); 56 else 57 ErrorPtr = sourceStart; 58 } 59 assert((result != targetExhausted) && 60 "ConvertUTF8toUTFXX exhausted target buffer"); 61 return result == conversionOK; 62 } 63 64 bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) { 65 const UTF32 *SourceStart = &Source; 66 const UTF32 *SourceEnd = SourceStart + 1; 67 UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr); 68 UTF8 *TargetEnd = TargetStart + 4; 69 ConversionResult CR = ConvertUTF32toUTF8( 70 &SourceStart, SourceEnd, &TargetStart, TargetEnd, strictConversion); 71 if (CR != conversionOK) 72 return false; 73 74 ResultPtr = reinterpret_cast<char *>(TargetStart); 75 return true; 76 } 77 78 bool hasUTF16ByteOrderMark(ArrayRef<char> S) { 79 return (S.size() >= 2 && ((S[0] == '\xff' && S[1] == '\xfe') || 80 (S[0] == '\xfe' && S[1] == '\xff'))); 81 } 82 83 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) { 84 assert(Out.empty()); 85 86 // Error out on an uneven byte count. 87 if (SrcBytes.size() % 2) 88 return false; 89 90 // Avoid OOB by returning early on empty input. 91 if (SrcBytes.empty()) 92 return true; 93 94 const UTF16 *Src = reinterpret_cast<const UTF16 *>(SrcBytes.begin()); 95 const UTF16 *SrcEnd = reinterpret_cast<const UTF16 *>(SrcBytes.end()); 96 97 assert((uintptr_t)Src % sizeof(UTF16) == 0); 98 99 // Byteswap if necessary. 100 std::vector<UTF16> ByteSwapped; 101 if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) { 102 ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd); 103 for (UTF16 &I : ByteSwapped) 104 I = llvm::byteswap<uint16_t>(I); 105 Src = &ByteSwapped[0]; 106 SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1; 107 } 108 109 // Skip the BOM for conversion. 110 if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE) 111 Src++; 112 113 // Just allocate enough space up front. We'll shrink it later. Allocate 114 // enough that we can fit a null terminator without reallocating. 115 Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1); 116 UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]); 117 UTF8 *DstEnd = Dst + Out.size(); 118 119 ConversionResult CR = 120 ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion); 121 assert(CR != targetExhausted); 122 123 if (CR != conversionOK) { 124 Out.clear(); 125 return false; 126 } 127 128 Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]); 129 Out.push_back(0); 130 Out.pop_back(); 131 return true; 132 } 133 134 bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) { 135 return convertUTF16ToUTF8String( 136 llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()), 137 Src.size() * sizeof(UTF16)), 138 Out); 139 } 140 141 bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) { 142 assert(Out.empty()); 143 144 // Error out on an uneven byte count. 145 if (SrcBytes.size() % 4) 146 return false; 147 148 // Avoid OOB by returning early on empty input. 149 if (SrcBytes.empty()) 150 return true; 151 152 const UTF32 *Src = reinterpret_cast<const UTF32 *>(SrcBytes.begin()); 153 const UTF32 *SrcEnd = reinterpret_cast<const UTF32 *>(SrcBytes.end()); 154 155 assert((uintptr_t)Src % sizeof(UTF32) == 0); 156 157 // Byteswap if necessary. 158 std::vector<UTF32> ByteSwapped; 159 if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_SWAPPED) { 160 ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd); 161 for (UTF32 &I : ByteSwapped) 162 I = llvm::byteswap<uint32_t>(I); 163 Src = &ByteSwapped[0]; 164 SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1; 165 } 166 167 // Skip the BOM for conversion. 168 if (Src[0] == UNI_UTF32_BYTE_ORDER_MARK_NATIVE) 169 Src++; 170 171 // Just allocate enough space up front. We'll shrink it later. Allocate 172 // enough that we can fit a null terminator without reallocating. 173 Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1); 174 UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]); 175 UTF8 *DstEnd = Dst + Out.size(); 176 177 ConversionResult CR = 178 ConvertUTF32toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion); 179 assert(CR != targetExhausted); 180 181 if (CR != conversionOK) { 182 Out.clear(); 183 return false; 184 } 185 186 Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]); 187 Out.push_back(0); 188 Out.pop_back(); 189 return true; 190 } 191 192 bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out) { 193 return convertUTF32ToUTF8String( 194 llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()), 195 Src.size() * sizeof(UTF32)), 196 Out); 197 } 198 199 bool convertUTF8ToUTF16String(StringRef SrcUTF8, 200 SmallVectorImpl<UTF16> &DstUTF16) { 201 assert(DstUTF16.empty()); 202 203 // Avoid OOB by returning early on empty input. 204 if (SrcUTF8.empty()) { 205 DstUTF16.push_back(0); 206 DstUTF16.pop_back(); 207 return true; 208 } 209 210 const UTF8 *Src = reinterpret_cast<const UTF8 *>(SrcUTF8.begin()); 211 const UTF8 *SrcEnd = reinterpret_cast<const UTF8 *>(SrcUTF8.end()); 212 213 // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding 214 // as UTF-16 should always require the same amount or less code units than the 215 // UTF-8 encoding. Allocate one extra byte for the null terminator though, 216 // so that someone calling DstUTF16.data() gets a null terminated string. 217 // We resize down later so we don't have to worry that this over allocates. 218 DstUTF16.resize(SrcUTF8.size()+1); 219 UTF16 *Dst = &DstUTF16[0]; 220 UTF16 *DstEnd = Dst + DstUTF16.size(); 221 222 ConversionResult CR = 223 ConvertUTF8toUTF16(&Src, SrcEnd, &Dst, DstEnd, strictConversion); 224 assert(CR != targetExhausted); 225 226 if (CR != conversionOK) { 227 DstUTF16.clear(); 228 return false; 229 } 230 231 DstUTF16.resize(Dst - &DstUTF16[0]); 232 DstUTF16.push_back(0); 233 DstUTF16.pop_back(); 234 return true; 235 } 236 237 static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 || 238 sizeof(wchar_t) == 4, 239 "Expected wchar_t to be 1, 2, or 4 bytes"); 240 241 template <typename TResult> 242 static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source, 243 TResult &Result) { 244 // Even in the case of UTF-16, the number of bytes in a UTF-8 string is 245 // at least as large as the number of elements in the resulting wide 246 // string, because surrogate pairs take at least 4 bytes in UTF-8. 247 Result.resize(Source.size() + 1); 248 char *ResultPtr = reinterpret_cast<char *>(&Result[0]); 249 const UTF8 *ErrorPtr; 250 if (!ConvertUTF8toWide(sizeof(wchar_t), Source, ResultPtr, ErrorPtr)) { 251 Result.clear(); 252 return false; 253 } 254 Result.resize(reinterpret_cast<wchar_t *>(ResultPtr) - &Result[0]); 255 return true; 256 } 257 258 bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) { 259 return ConvertUTF8toWideInternal(Source, Result); 260 } 261 262 bool ConvertUTF8toWide(const char *Source, std::wstring &Result) { 263 if (!Source) { 264 Result.clear(); 265 return true; 266 } 267 return ConvertUTF8toWide(llvm::StringRef(Source), Result); 268 } 269 270 bool convertWideToUTF8(const std::wstring &Source, std::string &Result) { 271 if (sizeof(wchar_t) == 1) { 272 const UTF8 *Start = reinterpret_cast<const UTF8 *>(Source.data()); 273 const UTF8 *End = 274 reinterpret_cast<const UTF8 *>(Source.data() + Source.size()); 275 if (!isLegalUTF8String(&Start, End)) 276 return false; 277 Result.resize(Source.size()); 278 memcpy(&Result[0], Source.data(), Source.size()); 279 return true; 280 } else if (sizeof(wchar_t) == 2) { 281 return convertUTF16ToUTF8String( 282 llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()), 283 Source.size()), 284 Result); 285 } else if (sizeof(wchar_t) == 4) { 286 const UTF32 *Start = reinterpret_cast<const UTF32 *>(Source.data()); 287 const UTF32 *End = 288 reinterpret_cast<const UTF32 *>(Source.data() + Source.size()); 289 Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size()); 290 UTF8 *ResultPtr = reinterpret_cast<UTF8 *>(&Result[0]); 291 UTF8 *ResultEnd = reinterpret_cast<UTF8 *>(&Result[0] + Result.size()); 292 if (ConvertUTF32toUTF8(&Start, End, &ResultPtr, ResultEnd, 293 strictConversion) == conversionOK) { 294 Result.resize(reinterpret_cast<char *>(ResultPtr) - &Result[0]); 295 return true; 296 } else { 297 Result.clear(); 298 return false; 299 } 300 } else { 301 llvm_unreachable( 302 "Control should never reach this point; see static_assert further up"); 303 } 304 } 305 306 } // end namespace llvm 307 308