1*81ad6265SDimitry Andric //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties 2*81ad6265SDimitry Andric //-*- C++ -*-===// 3*81ad6265SDimitry Andric // 4*81ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5*81ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 6*81ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7*81ad6265SDimitry Andric // 8*81ad6265SDimitry Andric //===----------------------------------------------------------------------===// 9*81ad6265SDimitry Andric // 10*81ad6265SDimitry Andric // This file implements functions to map the name or alias of a unicode 11*81ad6265SDimitry Andric // character to its codepoint. 12*81ad6265SDimitry Andric // 13*81ad6265SDimitry Andric //===----------------------------------------------------------------------===// 14*81ad6265SDimitry Andric 15*81ad6265SDimitry Andric #include "llvm/ADT/STLExtras.h" 16*81ad6265SDimitry Andric #include "llvm/ADT/StringExtras.h" 17*81ad6265SDimitry Andric #include "llvm/ADT/StringRef.h" 18*81ad6265SDimitry Andric #include "llvm/Support/Unicode.h" 19*81ad6265SDimitry Andric 20*81ad6265SDimitry Andric namespace llvm { 21*81ad6265SDimitry Andric namespace sys { 22*81ad6265SDimitry Andric namespace unicode { 23*81ad6265SDimitry Andric 24*81ad6265SDimitry Andric extern const char *UnicodeNameToCodepointDict; 25*81ad6265SDimitry Andric extern const uint8_t *UnicodeNameToCodepointIndex; 26*81ad6265SDimitry Andric extern const std::size_t UnicodeNameToCodepointIndexSize; 27*81ad6265SDimitry Andric extern const std::size_t UnicodeNameToCodepointLargestNameSize; 28*81ad6265SDimitry Andric 29*81ad6265SDimitry Andric using BufferType = SmallString<64>; 30*81ad6265SDimitry Andric 31*81ad6265SDimitry Andric struct Node { 32*81ad6265SDimitry Andric bool IsRoot = false; 33*81ad6265SDimitry Andric char32_t Value = 0xFFFFFFFF; 34*81ad6265SDimitry Andric uint32_t ChildrenOffset = 0; 35*81ad6265SDimitry Andric bool HasSibling = false; 36*81ad6265SDimitry Andric uint32_t Size = 0; 37*81ad6265SDimitry Andric StringRef Name; 38*81ad6265SDimitry Andric const Node *Parent = nullptr; 39*81ad6265SDimitry Andric 40*81ad6265SDimitry Andric constexpr bool isValid() const { 41*81ad6265SDimitry Andric return !Name.empty() || Value == 0xFFFFFFFF; 42*81ad6265SDimitry Andric } 43*81ad6265SDimitry Andric constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; } 44*81ad6265SDimitry Andric 45*81ad6265SDimitry Andric std::string fullName() const { 46*81ad6265SDimitry Andric std::string S; 47*81ad6265SDimitry Andric // Reserve enough space for most unicode code points. 48*81ad6265SDimitry Andric // The chosen value represent the 99th percentile of name size as of 49*81ad6265SDimitry Andric // Unicode 14. 50*81ad6265SDimitry Andric S.reserve(46); 51*81ad6265SDimitry Andric const Node *N = this; 52*81ad6265SDimitry Andric while (N) { 53*81ad6265SDimitry Andric std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S)); 54*81ad6265SDimitry Andric N = N->Parent; 55*81ad6265SDimitry Andric } 56*81ad6265SDimitry Andric std::reverse(S.begin(), S.end()); 57*81ad6265SDimitry Andric return S; 58*81ad6265SDimitry Andric } 59*81ad6265SDimitry Andric }; 60*81ad6265SDimitry Andric 61*81ad6265SDimitry Andric static Node createRoot() { 62*81ad6265SDimitry Andric Node N; 63*81ad6265SDimitry Andric N.IsRoot = true; 64*81ad6265SDimitry Andric N.ChildrenOffset = 1; 65*81ad6265SDimitry Andric N.Size = 1; 66*81ad6265SDimitry Andric return N; 67*81ad6265SDimitry Andric } 68*81ad6265SDimitry Andric 69*81ad6265SDimitry Andric static Node readNode(uint32_t Offset, const Node *Parent = nullptr) { 70*81ad6265SDimitry Andric if (Offset == 0) 71*81ad6265SDimitry Andric return createRoot(); 72*81ad6265SDimitry Andric 73*81ad6265SDimitry Andric uint32_t Origin = Offset; 74*81ad6265SDimitry Andric Node N; 75*81ad6265SDimitry Andric N.Parent = Parent; 76*81ad6265SDimitry Andric uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++]; 77*81ad6265SDimitry Andric if (Offset + 6 >= UnicodeNameToCodepointIndexSize) 78*81ad6265SDimitry Andric return N; 79*81ad6265SDimitry Andric 80*81ad6265SDimitry Andric bool LongName = NameInfo & 0x40; 81*81ad6265SDimitry Andric bool HasValue = NameInfo & 0x80; 82*81ad6265SDimitry Andric std::size_t Size = NameInfo & ~0xC0; 83*81ad6265SDimitry Andric if (LongName) { 84*81ad6265SDimitry Andric uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8); 85*81ad6265SDimitry Andric NameOffset |= UnicodeNameToCodepointIndex[Offset++]; 86*81ad6265SDimitry Andric N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size); 87*81ad6265SDimitry Andric } else { 88*81ad6265SDimitry Andric N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1); 89*81ad6265SDimitry Andric } 90*81ad6265SDimitry Andric if (HasValue) { 91*81ad6265SDimitry Andric uint8_t H = UnicodeNameToCodepointIndex[Offset++]; 92*81ad6265SDimitry Andric uint8_t M = UnicodeNameToCodepointIndex[Offset++]; 93*81ad6265SDimitry Andric uint8_t L = UnicodeNameToCodepointIndex[Offset++]; 94*81ad6265SDimitry Andric N.Value = ((H << 16) | (M << 8) | L) >> 3; 95*81ad6265SDimitry Andric 96*81ad6265SDimitry Andric bool HasChildren = L & 0x02; 97*81ad6265SDimitry Andric N.HasSibling = L & 0x01; 98*81ad6265SDimitry Andric 99*81ad6265SDimitry Andric if (HasChildren) { 100*81ad6265SDimitry Andric N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16; 101*81ad6265SDimitry Andric N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8; 102*81ad6265SDimitry Andric N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; 103*81ad6265SDimitry Andric } 104*81ad6265SDimitry Andric } else { 105*81ad6265SDimitry Andric uint8_t H = UnicodeNameToCodepointIndex[Offset++]; 106*81ad6265SDimitry Andric N.HasSibling = H & 0x80; 107*81ad6265SDimitry Andric bool HasChildren = H & 0x40; 108*81ad6265SDimitry Andric H &= ~0xC0; 109*81ad6265SDimitry Andric if (HasChildren) { 110*81ad6265SDimitry Andric N.ChildrenOffset = (H << 16); 111*81ad6265SDimitry Andric N.ChildrenOffset |= 112*81ad6265SDimitry Andric (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8); 113*81ad6265SDimitry Andric N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; 114*81ad6265SDimitry Andric } 115*81ad6265SDimitry Andric } 116*81ad6265SDimitry Andric N.Size = Offset - Origin; 117*81ad6265SDimitry Andric return N; 118*81ad6265SDimitry Andric } 119*81ad6265SDimitry Andric 120*81ad6265SDimitry Andric static bool startsWith(StringRef Name, StringRef Needle, bool Strict, 121*81ad6265SDimitry Andric std::size_t &Consummed, char &PreviousCharInName, 122*81ad6265SDimitry Andric char &PreviousCharInNeedle, bool IsPrefix = false) { 123*81ad6265SDimitry Andric 124*81ad6265SDimitry Andric Consummed = 0; 125*81ad6265SDimitry Andric if (Strict) { 126*81ad6265SDimitry Andric if (!Name.startswith(Needle)) 127*81ad6265SDimitry Andric return false; 128*81ad6265SDimitry Andric Consummed = Needle.size(); 129*81ad6265SDimitry Andric return true; 130*81ad6265SDimitry Andric } 131*81ad6265SDimitry Andric if (Needle.empty()) 132*81ad6265SDimitry Andric return true; 133*81ad6265SDimitry Andric 134*81ad6265SDimitry Andric auto NamePos = Name.begin(); 135*81ad6265SDimitry Andric auto NeedlePos = Needle.begin(); 136*81ad6265SDimitry Andric 137*81ad6265SDimitry Andric char PreviousCharInNameOrigin = PreviousCharInName; 138*81ad6265SDimitry Andric char PreviousCharInNeedleOrigin = PreviousCharInNeedle; 139*81ad6265SDimitry Andric 140*81ad6265SDimitry Andric auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar, 141*81ad6265SDimitry Andric bool IgnoreEnd = false) { 142*81ad6265SDimitry Andric while (It != End) { 143*81ad6265SDimitry Andric const auto Next = std::next(It); 144*81ad6265SDimitry Andric // Ignore spaces, underscore, medial hyphens 145*81ad6265SDimitry Andric // https://unicode.org/reports/tr44/#UAX44-LM2. 146*81ad6265SDimitry Andric bool Ignore = 147*81ad6265SDimitry Andric *It == ' ' || *It == '_' || 148*81ad6265SDimitry Andric (*It == '-' && isAlnum(PreviousChar) && 149*81ad6265SDimitry Andric ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd))); 150*81ad6265SDimitry Andric PreviousChar = *It; 151*81ad6265SDimitry Andric if (!Ignore) 152*81ad6265SDimitry Andric break; 153*81ad6265SDimitry Andric ++It; 154*81ad6265SDimitry Andric } 155*81ad6265SDimitry Andric return It; 156*81ad6265SDimitry Andric }; 157*81ad6265SDimitry Andric 158*81ad6265SDimitry Andric while (true) { 159*81ad6265SDimitry Andric NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName); 160*81ad6265SDimitry Andric NeedlePos = 161*81ad6265SDimitry Andric IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix); 162*81ad6265SDimitry Andric if (NeedlePos == Needle.end()) 163*81ad6265SDimitry Andric break; 164*81ad6265SDimitry Andric if (NamePos == Name.end()) 165*81ad6265SDimitry Andric break; 166*81ad6265SDimitry Andric if (toUpper(*NeedlePos) != toUpper(*NamePos)) 167*81ad6265SDimitry Andric break; 168*81ad6265SDimitry Andric NeedlePos++; 169*81ad6265SDimitry Andric NamePos++; 170*81ad6265SDimitry Andric } 171*81ad6265SDimitry Andric Consummed = std::distance(Name.begin(), NamePos); 172*81ad6265SDimitry Andric if (NeedlePos != Needle.end()) { 173*81ad6265SDimitry Andric PreviousCharInName = PreviousCharInNameOrigin; 174*81ad6265SDimitry Andric PreviousCharInNeedle = PreviousCharInNeedleOrigin; 175*81ad6265SDimitry Andric } 176*81ad6265SDimitry Andric return NeedlePos == Needle.end(); 177*81ad6265SDimitry Andric } 178*81ad6265SDimitry Andric 179*81ad6265SDimitry Andric static std::tuple<Node, bool, uint32_t> 180*81ad6265SDimitry Andric compareNode(uint32_t Offset, StringRef Name, bool Strict, 181*81ad6265SDimitry Andric char PreviousCharInName, char PreviousCharInNeedle, 182*81ad6265SDimitry Andric BufferType &Buffer, const Node *Parent = nullptr) { 183*81ad6265SDimitry Andric Node N = readNode(Offset, Parent); 184*81ad6265SDimitry Andric std::size_t Consummed = 0; 185*81ad6265SDimitry Andric bool DoesStartWith = 186*81ad6265SDimitry Andric N.IsRoot || startsWith(Name, N.Name, Strict, Consummed, 187*81ad6265SDimitry Andric PreviousCharInName, PreviousCharInNeedle); 188*81ad6265SDimitry Andric if (!DoesStartWith) 189*81ad6265SDimitry Andric return std::make_tuple(N, false, 0); 190*81ad6265SDimitry Andric 191*81ad6265SDimitry Andric if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF) 192*81ad6265SDimitry Andric return std::make_tuple(N, true, N.Value); 193*81ad6265SDimitry Andric 194*81ad6265SDimitry Andric if (N.hasChildren()) { 195*81ad6265SDimitry Andric uint32_t ChildOffset = N.ChildrenOffset; 196*81ad6265SDimitry Andric for (;;) { 197*81ad6265SDimitry Andric Node C; 198*81ad6265SDimitry Andric bool Matches; 199*81ad6265SDimitry Andric uint32_t Value; 200*81ad6265SDimitry Andric std::tie(C, Matches, Value) = 201*81ad6265SDimitry Andric compareNode(ChildOffset, Name.substr(Consummed), Strict, 202*81ad6265SDimitry Andric PreviousCharInName, PreviousCharInNeedle, Buffer, &N); 203*81ad6265SDimitry Andric if (Matches) { 204*81ad6265SDimitry Andric std::reverse_copy(C.Name.begin(), C.Name.end(), 205*81ad6265SDimitry Andric std::back_inserter(Buffer)); 206*81ad6265SDimitry Andric return std::make_tuple(N, true, Value); 207*81ad6265SDimitry Andric } 208*81ad6265SDimitry Andric ChildOffset += C.Size; 209*81ad6265SDimitry Andric if (!C.HasSibling) 210*81ad6265SDimitry Andric break; 211*81ad6265SDimitry Andric } 212*81ad6265SDimitry Andric } 213*81ad6265SDimitry Andric return std::make_tuple(N, false, 0); 214*81ad6265SDimitry Andric } 215*81ad6265SDimitry Andric 216*81ad6265SDimitry Andric static std::tuple<Node, bool, uint32_t> 217*81ad6265SDimitry Andric compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) { 218*81ad6265SDimitry Andric return compareNode(Offset, Name, Strict, 0, 0, Buffer); 219*81ad6265SDimitry Andric } 220*81ad6265SDimitry Andric 221*81ad6265SDimitry Andric // clang-format off 222*81ad6265SDimitry Andric constexpr const char *const HangulSyllables[][3] = { 223*81ad6265SDimitry Andric { "G", "A", "" }, 224*81ad6265SDimitry Andric { "GG", "AE", "G" }, 225*81ad6265SDimitry Andric { "N", "YA", "GG" }, 226*81ad6265SDimitry Andric { "D", "YAE", "GS" }, 227*81ad6265SDimitry Andric { "DD", "EO", "N", }, 228*81ad6265SDimitry Andric { "R", "E", "NJ" }, 229*81ad6265SDimitry Andric { "M", "YEO", "NH" }, 230*81ad6265SDimitry Andric { "B", "YE", "D" }, 231*81ad6265SDimitry Andric { "BB", "O", "L" }, 232*81ad6265SDimitry Andric { "S", "WA", "LG" }, 233*81ad6265SDimitry Andric { "SS", "WAE", "LM" }, 234*81ad6265SDimitry Andric { "", "OE", "LB" }, 235*81ad6265SDimitry Andric { "J", "YO", "LS" }, 236*81ad6265SDimitry Andric { "JJ", "U", "LT" }, 237*81ad6265SDimitry Andric { "C", "WEO", "LP" }, 238*81ad6265SDimitry Andric { "K", "WE", "LH" }, 239*81ad6265SDimitry Andric { "T", "WI", "M" }, 240*81ad6265SDimitry Andric { "P", "YU", "B" }, 241*81ad6265SDimitry Andric { "H", "EU", "BS" }, 242*81ad6265SDimitry Andric { 0, "YI", "S" }, 243*81ad6265SDimitry Andric { 0, "I", "SS" }, 244*81ad6265SDimitry Andric { 0, 0, "NG" }, 245*81ad6265SDimitry Andric { 0, 0, "J" }, 246*81ad6265SDimitry Andric { 0, 0, "C" }, 247*81ad6265SDimitry Andric { 0, 0, "K" }, 248*81ad6265SDimitry Andric { 0, 0, "T" }, 249*81ad6265SDimitry Andric { 0, 0, "P" }, 250*81ad6265SDimitry Andric { 0, 0, "H" } 251*81ad6265SDimitry Andric }; 252*81ad6265SDimitry Andric // clang-format on 253*81ad6265SDimitry Andric 254*81ad6265SDimitry Andric // Unicode 14.0 255*81ad6265SDimitry Andric // 3.12 Conjoining Jamo Behavior Common constants 256*81ad6265SDimitry Andric constexpr const char32_t SBase = 0xAC00; 257*81ad6265SDimitry Andric constexpr const uint32_t LCount = 19; 258*81ad6265SDimitry Andric constexpr const uint32_t VCount = 21; 259*81ad6265SDimitry Andric constexpr const uint32_t TCount = 28; 260*81ad6265SDimitry Andric 261*81ad6265SDimitry Andric static std::size_t findSyllable(StringRef Name, bool Strict, 262*81ad6265SDimitry Andric char &PreviousInName, int &Pos, int Column) { 263*81ad6265SDimitry Andric assert(Column == 0 || Column == 1 || Column == 2); 264*81ad6265SDimitry Andric static std::size_t CountPerColumn[] = {LCount, VCount, TCount}; 265*81ad6265SDimitry Andric char NeedleStart = 0; 266*81ad6265SDimitry Andric int Len = -1; 267*81ad6265SDimitry Andric int Prev = PreviousInName; 268*81ad6265SDimitry Andric for (std::size_t I = 0; I < CountPerColumn[Column]; I++) { 269*81ad6265SDimitry Andric StringRef Syllable(HangulSyllables[I][Column]); 270*81ad6265SDimitry Andric if (int(Syllable.size()) <= Len) 271*81ad6265SDimitry Andric continue; 272*81ad6265SDimitry Andric std::size_t Consummed = 0; 273*81ad6265SDimitry Andric char PreviousInNameCopy = PreviousInName; 274*81ad6265SDimitry Andric bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed, 275*81ad6265SDimitry Andric PreviousInNameCopy, NeedleStart); 276*81ad6265SDimitry Andric if (!DoesStartWith) 277*81ad6265SDimitry Andric continue; 278*81ad6265SDimitry Andric Len = Consummed; 279*81ad6265SDimitry Andric Pos = I; 280*81ad6265SDimitry Andric Prev = PreviousInNameCopy; 281*81ad6265SDimitry Andric } 282*81ad6265SDimitry Andric if (Len == -1) 283*81ad6265SDimitry Andric return 0; 284*81ad6265SDimitry Andric PreviousInName = Prev; 285*81ad6265SDimitry Andric return size_t(Len); 286*81ad6265SDimitry Andric } 287*81ad6265SDimitry Andric 288*81ad6265SDimitry Andric static llvm::Optional<char32_t> 289*81ad6265SDimitry Andric nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { 290*81ad6265SDimitry Andric Buffer.clear(); 291*81ad6265SDimitry Andric // Hangul Syllable Decomposition 292*81ad6265SDimitry Andric std::size_t Consummed = 0; 293*81ad6265SDimitry Andric char NameStart = 0, NeedleStart = 0; 294*81ad6265SDimitry Andric bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, 295*81ad6265SDimitry Andric NameStart, NeedleStart); 296*81ad6265SDimitry Andric if (!DoesStartWith) 297*81ad6265SDimitry Andric return None; 298*81ad6265SDimitry Andric Name = Name.substr(Consummed); 299*81ad6265SDimitry Andric int L = -1, V = -1, T = -1; 300*81ad6265SDimitry Andric Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0)); 301*81ad6265SDimitry Andric Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1)); 302*81ad6265SDimitry Andric Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2)); 303*81ad6265SDimitry Andric if (L != -1 && V != -1 && T != -1 && Name.empty()) { 304*81ad6265SDimitry Andric if (!Strict) { 305*81ad6265SDimitry Andric Buffer.append("HANGUL SYLLABLE "); 306*81ad6265SDimitry Andric if (L != -1) 307*81ad6265SDimitry Andric Buffer.append(HangulSyllables[L][0]); 308*81ad6265SDimitry Andric if (V != -1) 309*81ad6265SDimitry Andric Buffer.append(HangulSyllables[V][1]); 310*81ad6265SDimitry Andric if (T != -1) 311*81ad6265SDimitry Andric Buffer.append(HangulSyllables[T][2]); 312*81ad6265SDimitry Andric } 313*81ad6265SDimitry Andric return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount + 314*81ad6265SDimitry Andric std::uint32_t(T); 315*81ad6265SDimitry Andric } 316*81ad6265SDimitry Andric // Otherwise, it's an illegal syllable name. 317*81ad6265SDimitry Andric return None; 318*81ad6265SDimitry Andric } 319*81ad6265SDimitry Andric 320*81ad6265SDimitry Andric struct GeneratedNamesData { 321*81ad6265SDimitry Andric StringRef Prefix; 322*81ad6265SDimitry Andric uint32_t Start; 323*81ad6265SDimitry Andric uint32_t End; 324*81ad6265SDimitry Andric }; 325*81ad6265SDimitry Andric 326*81ad6265SDimitry Andric // Unicode 14.0 Table 4-8. Name Derivation Rule Prefix Strings 327*81ad6265SDimitry Andric // This needs to be kept in sync with 328*81ad6265SDimitry Andric // llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp 329*81ad6265SDimitry Andric static const GeneratedNamesData GeneratedNamesDataTable[] = { 330*81ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF}, 331*81ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFC}, 332*81ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DD}, 333*81ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B734}, 334*81ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D}, 335*81ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1}, 336*81ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0}, 337*81ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A}, 338*81ad6265SDimitry Andric {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7}, 339*81ad6265SDimitry Andric {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08}, 340*81ad6265SDimitry Andric {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5}, 341*81ad6265SDimitry Andric {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB}, 342*81ad6265SDimitry Andric {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D}, 343*81ad6265SDimitry Andric {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9}, 344*81ad6265SDimitry Andric {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D}, 345*81ad6265SDimitry Andric }; 346*81ad6265SDimitry Andric 347*81ad6265SDimitry Andric static llvm::Optional<char32_t> 348*81ad6265SDimitry Andric nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { 349*81ad6265SDimitry Andric for (auto &&Item : GeneratedNamesDataTable) { 350*81ad6265SDimitry Andric Buffer.clear(); 351*81ad6265SDimitry Andric std::size_t Consummed = 0; 352*81ad6265SDimitry Andric char NameStart = 0, NeedleStart = 0; 353*81ad6265SDimitry Andric bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed, 354*81ad6265SDimitry Andric NameStart, NeedleStart, /*isPrefix*/ true); 355*81ad6265SDimitry Andric if (!DoesStartWith) 356*81ad6265SDimitry Andric continue; 357*81ad6265SDimitry Andric auto Number = Name.substr(Consummed); 358*81ad6265SDimitry Andric unsigned long long V = 0; 359*81ad6265SDimitry Andric // Be consistent about mandating upper casing. 360*81ad6265SDimitry Andric if (Strict && 361*81ad6265SDimitry Andric llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; })) 362*81ad6265SDimitry Andric return {}; 363*81ad6265SDimitry Andric if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End) 364*81ad6265SDimitry Andric continue; 365*81ad6265SDimitry Andric if (!Strict) { 366*81ad6265SDimitry Andric Buffer.append(Item.Prefix); 367*81ad6265SDimitry Andric Buffer.append(utohexstr(V, true)); 368*81ad6265SDimitry Andric } 369*81ad6265SDimitry Andric return V; 370*81ad6265SDimitry Andric } 371*81ad6265SDimitry Andric return None; 372*81ad6265SDimitry Andric } 373*81ad6265SDimitry Andric 374*81ad6265SDimitry Andric static llvm::Optional<char32_t> nameToCodepoint(StringRef Name, bool Strict, 375*81ad6265SDimitry Andric BufferType &Buffer) { 376*81ad6265SDimitry Andric if (Name.empty()) 377*81ad6265SDimitry Andric return None; 378*81ad6265SDimitry Andric 379*81ad6265SDimitry Andric llvm::Optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer); 380*81ad6265SDimitry Andric if (!Res) 381*81ad6265SDimitry Andric Res = nameToGeneratedCodePoint(Name, Strict, Buffer); 382*81ad6265SDimitry Andric if (Res) 383*81ad6265SDimitry Andric return *Res; 384*81ad6265SDimitry Andric 385*81ad6265SDimitry Andric Buffer.clear(); 386*81ad6265SDimitry Andric Node Node; 387*81ad6265SDimitry Andric bool Matches; 388*81ad6265SDimitry Andric uint32_t Value; 389*81ad6265SDimitry Andric std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer); 390*81ad6265SDimitry Andric if (Matches) { 391*81ad6265SDimitry Andric std::reverse(Buffer.begin(), Buffer.end()); 392*81ad6265SDimitry Andric // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial 393*81ad6265SDimitry Andric // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E. 394*81ad6265SDimitry Andric if (!Strict && Value == 0x116c && 395*81ad6265SDimitry Andric Name.find_insensitive("O-E") != StringRef::npos) { 396*81ad6265SDimitry Andric Buffer = "HANGUL JUNGSEONG O-E"; 397*81ad6265SDimitry Andric Value = 0x1180; 398*81ad6265SDimitry Andric } 399*81ad6265SDimitry Andric return Value; 400*81ad6265SDimitry Andric } 401*81ad6265SDimitry Andric return None; 402*81ad6265SDimitry Andric } 403*81ad6265SDimitry Andric 404*81ad6265SDimitry Andric llvm::Optional<char32_t> nameToCodepointStrict(StringRef Name) { 405*81ad6265SDimitry Andric 406*81ad6265SDimitry Andric BufferType Buffer; 407*81ad6265SDimitry Andric auto Opt = nameToCodepoint(Name, true, Buffer); 408*81ad6265SDimitry Andric return Opt; 409*81ad6265SDimitry Andric } 410*81ad6265SDimitry Andric 411*81ad6265SDimitry Andric llvm::Optional<LooseMatchingResult> 412*81ad6265SDimitry Andric nameToCodepointLooseMatching(StringRef Name) { 413*81ad6265SDimitry Andric BufferType Buffer; 414*81ad6265SDimitry Andric auto Opt = nameToCodepoint(Name, false, Buffer); 415*81ad6265SDimitry Andric if (!Opt) 416*81ad6265SDimitry Andric return None; 417*81ad6265SDimitry Andric return LooseMatchingResult{*Opt, Buffer}; 418*81ad6265SDimitry Andric } 419*81ad6265SDimitry Andric 420*81ad6265SDimitry Andric // Find the unicode character whose editing distance to Pattern 421*81ad6265SDimitry Andric // is shortest, using the Wagner–Fischer algorithm. 422*81ad6265SDimitry Andric llvm::SmallVector<MatchForCodepointName> 423*81ad6265SDimitry Andric nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) { 424*81ad6265SDimitry Andric // We maintain a fixed size vector of matches, 425*81ad6265SDimitry Andric // sorted by distance 426*81ad6265SDimitry Andric // The worst match (with the biggest distance) are discarded when new elements 427*81ad6265SDimitry Andric // are added. 428*81ad6265SDimitry Andric std::size_t LargestEditDistance = 0; 429*81ad6265SDimitry Andric llvm::SmallVector<MatchForCodepointName> Matches; 430*81ad6265SDimitry Andric Matches.reserve(MaxMatchesCount + 1); 431*81ad6265SDimitry Andric 432*81ad6265SDimitry Andric auto Insert = [&](const Node &Node, uint32_t Distance, 433*81ad6265SDimitry Andric char32_t Value) -> bool { 434*81ad6265SDimitry Andric if (Distance > LargestEditDistance) { 435*81ad6265SDimitry Andric if (Matches.size() == MaxMatchesCount) 436*81ad6265SDimitry Andric return false; 437*81ad6265SDimitry Andric LargestEditDistance = Distance; 438*81ad6265SDimitry Andric } 439*81ad6265SDimitry Andric // To avoid allocations, the creation of the name is delayed 440*81ad6265SDimitry Andric // as much as possible. 441*81ad6265SDimitry Andric std::string Name; 442*81ad6265SDimitry Andric auto GetName = [&] { 443*81ad6265SDimitry Andric if (Name.empty()) 444*81ad6265SDimitry Andric Name = Node.fullName(); 445*81ad6265SDimitry Andric return Name; 446*81ad6265SDimitry Andric }; 447*81ad6265SDimitry Andric 448*81ad6265SDimitry Andric auto It = std::lower_bound( 449*81ad6265SDimitry Andric Matches.begin(), Matches.end(), Distance, 450*81ad6265SDimitry Andric [&](const MatchForCodepointName &a, std::size_t Distance) { 451*81ad6265SDimitry Andric if (Distance == a.Distance) 452*81ad6265SDimitry Andric return a.Name < GetName(); 453*81ad6265SDimitry Andric return a.Distance < Distance; 454*81ad6265SDimitry Andric }); 455*81ad6265SDimitry Andric if (It == Matches.end() && Matches.size() == MaxMatchesCount) 456*81ad6265SDimitry Andric return false; 457*81ad6265SDimitry Andric 458*81ad6265SDimitry Andric MatchForCodepointName M{GetName(), Distance, Value}; 459*81ad6265SDimitry Andric Matches.insert(It, std::move(M)); 460*81ad6265SDimitry Andric if (Matches.size() > MaxMatchesCount) 461*81ad6265SDimitry Andric Matches.pop_back(); 462*81ad6265SDimitry Andric return true; 463*81ad6265SDimitry Andric }; 464*81ad6265SDimitry Andric 465*81ad6265SDimitry Andric // We ignore case, space, hyphens, etc, 466*81ad6265SDimitry Andric // in both the search pattern and the prospective names. 467*81ad6265SDimitry Andric auto Normalize = [](StringRef Name) { 468*81ad6265SDimitry Andric std::string Out; 469*81ad6265SDimitry Andric Out.reserve(Name.size()); 470*81ad6265SDimitry Andric for (char C : Name) { 471*81ad6265SDimitry Andric if (isAlnum(C)) 472*81ad6265SDimitry Andric Out.push_back(toUpper(C)); 473*81ad6265SDimitry Andric } 474*81ad6265SDimitry Andric return Out; 475*81ad6265SDimitry Andric }; 476*81ad6265SDimitry Andric std::string NormalizedName = Normalize(Pattern); 477*81ad6265SDimitry Andric 478*81ad6265SDimitry Andric // Allocate a matrix big enough for longest names. 479*81ad6265SDimitry Andric const std::size_t Columns = 480*81ad6265SDimitry Andric std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) + 481*81ad6265SDimitry Andric 1; 482*81ad6265SDimitry Andric 483*81ad6265SDimitry Andric LLVM_ATTRIBUTE_UNUSED static std::size_t Rows = 484*81ad6265SDimitry Andric UnicodeNameToCodepointLargestNameSize + 1; 485*81ad6265SDimitry Andric 486*81ad6265SDimitry Andric std::vector<char> Distances( 487*81ad6265SDimitry Andric Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0); 488*81ad6265SDimitry Andric 489*81ad6265SDimitry Andric auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & { 490*81ad6265SDimitry Andric assert(Column < Columns); 491*81ad6265SDimitry Andric assert(Row < Rows); 492*81ad6265SDimitry Andric return Distances[Row * Columns + Column]; 493*81ad6265SDimitry Andric }; 494*81ad6265SDimitry Andric 495*81ad6265SDimitry Andric for (std::size_t I = 0; I < Columns; I++) 496*81ad6265SDimitry Andric Get(I, 0) = I; 497*81ad6265SDimitry Andric 498*81ad6265SDimitry Andric // Visit the childrens, 499*81ad6265SDimitry Andric // Filling (and overriding) the matrix for the name fragment of each node 500*81ad6265SDimitry Andric // iteratively. CompleteName is used to collect the actual name of potential 501*81ad6265SDimitry Andric // match, respecting case and spacing. 502*81ad6265SDimitry Andric auto VisitNode = [&](const Node &N, std::size_t Row, 503*81ad6265SDimitry Andric auto &VisitNode) -> void { 504*81ad6265SDimitry Andric std::size_t J = 0; 505*81ad6265SDimitry Andric for (; J < N.Name.size(); J++) { 506*81ad6265SDimitry Andric if (!isAlnum(N.Name[J])) 507*81ad6265SDimitry Andric continue; 508*81ad6265SDimitry Andric 509*81ad6265SDimitry Andric Get(0, Row) = Row; 510*81ad6265SDimitry Andric 511*81ad6265SDimitry Andric for (std::size_t I = 1; I < Columns; I++) { 512*81ad6265SDimitry Andric const int Delete = Get(I - 1, Row) + 1; 513*81ad6265SDimitry Andric const int Insert = Get(I, Row - 1) + 1; 514*81ad6265SDimitry Andric 515*81ad6265SDimitry Andric const int Replace = 516*81ad6265SDimitry Andric Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0); 517*81ad6265SDimitry Andric 518*81ad6265SDimitry Andric Get(I, Row) = std::min(Insert, std::min(Delete, Replace)); 519*81ad6265SDimitry Andric } 520*81ad6265SDimitry Andric 521*81ad6265SDimitry Andric Row++; 522*81ad6265SDimitry Andric } 523*81ad6265SDimitry Andric 524*81ad6265SDimitry Andric unsigned Cost = Get(Columns - 1, Row - 1); 525*81ad6265SDimitry Andric if (N.Value != 0xFFFFFFFF) { 526*81ad6265SDimitry Andric Insert(N, Cost, N.Value); 527*81ad6265SDimitry Andric } 528*81ad6265SDimitry Andric 529*81ad6265SDimitry Andric if (N.hasChildren()) { 530*81ad6265SDimitry Andric auto ChildOffset = N.ChildrenOffset; 531*81ad6265SDimitry Andric for (;;) { 532*81ad6265SDimitry Andric Node C = readNode(ChildOffset, &N); 533*81ad6265SDimitry Andric ChildOffset += C.Size; 534*81ad6265SDimitry Andric if (!C.isValid()) 535*81ad6265SDimitry Andric break; 536*81ad6265SDimitry Andric VisitNode(C, Row, VisitNode); 537*81ad6265SDimitry Andric if (!C.HasSibling) 538*81ad6265SDimitry Andric break; 539*81ad6265SDimitry Andric } 540*81ad6265SDimitry Andric } 541*81ad6265SDimitry Andric }; 542*81ad6265SDimitry Andric 543*81ad6265SDimitry Andric Node Root = createRoot(); 544*81ad6265SDimitry Andric VisitNode(Root, 1, VisitNode); 545*81ad6265SDimitry Andric return Matches; 546*81ad6265SDimitry Andric } 547*81ad6265SDimitry Andric 548*81ad6265SDimitry Andric } // namespace unicode 549*81ad6265SDimitry Andric 550*81ad6265SDimitry Andric } // namespace sys 551*81ad6265SDimitry Andric } // namespace llvm 552