xref: /freebsd-src/contrib/llvm-project/llvm/lib/Support/UnicodeNameToCodepoint.cpp (revision 81ad626541db97eb356e2c1d4a20eb2a26a766ab)
1*81ad6265SDimitry Andric //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
2*81ad6265SDimitry Andric //-*- C++ -*-===//
3*81ad6265SDimitry Andric //
4*81ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5*81ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
6*81ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7*81ad6265SDimitry Andric //
8*81ad6265SDimitry Andric //===----------------------------------------------------------------------===//
9*81ad6265SDimitry Andric //
10*81ad6265SDimitry Andric // This file implements functions to map the name or alias of a unicode
11*81ad6265SDimitry Andric // character to its codepoint.
12*81ad6265SDimitry Andric //
13*81ad6265SDimitry Andric //===----------------------------------------------------------------------===//
14*81ad6265SDimitry Andric 
15*81ad6265SDimitry Andric #include "llvm/ADT/STLExtras.h"
16*81ad6265SDimitry Andric #include "llvm/ADT/StringExtras.h"
17*81ad6265SDimitry Andric #include "llvm/ADT/StringRef.h"
18*81ad6265SDimitry Andric #include "llvm/Support/Unicode.h"
19*81ad6265SDimitry Andric 
20*81ad6265SDimitry Andric namespace llvm {
21*81ad6265SDimitry Andric namespace sys {
22*81ad6265SDimitry Andric namespace unicode {
23*81ad6265SDimitry Andric 
24*81ad6265SDimitry Andric extern const char *UnicodeNameToCodepointDict;
25*81ad6265SDimitry Andric extern const uint8_t *UnicodeNameToCodepointIndex;
26*81ad6265SDimitry Andric extern const std::size_t UnicodeNameToCodepointIndexSize;
27*81ad6265SDimitry Andric extern const std::size_t UnicodeNameToCodepointLargestNameSize;
28*81ad6265SDimitry Andric 
29*81ad6265SDimitry Andric using BufferType = SmallString<64>;
30*81ad6265SDimitry Andric 
31*81ad6265SDimitry Andric struct Node {
32*81ad6265SDimitry Andric   bool IsRoot = false;
33*81ad6265SDimitry Andric   char32_t Value = 0xFFFFFFFF;
34*81ad6265SDimitry Andric   uint32_t ChildrenOffset = 0;
35*81ad6265SDimitry Andric   bool HasSibling = false;
36*81ad6265SDimitry Andric   uint32_t Size = 0;
37*81ad6265SDimitry Andric   StringRef Name;
38*81ad6265SDimitry Andric   const Node *Parent = nullptr;
39*81ad6265SDimitry Andric 
40*81ad6265SDimitry Andric   constexpr bool isValid() const {
41*81ad6265SDimitry Andric     return !Name.empty() || Value == 0xFFFFFFFF;
42*81ad6265SDimitry Andric   }
43*81ad6265SDimitry Andric   constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
44*81ad6265SDimitry Andric 
45*81ad6265SDimitry Andric   std::string fullName() const {
46*81ad6265SDimitry Andric     std::string S;
47*81ad6265SDimitry Andric     // Reserve enough space for most unicode code points.
48*81ad6265SDimitry Andric     // The chosen value represent the 99th percentile of name size as of
49*81ad6265SDimitry Andric     // Unicode 14.
50*81ad6265SDimitry Andric     S.reserve(46);
51*81ad6265SDimitry Andric     const Node *N = this;
52*81ad6265SDimitry Andric     while (N) {
53*81ad6265SDimitry Andric       std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
54*81ad6265SDimitry Andric       N = N->Parent;
55*81ad6265SDimitry Andric     }
56*81ad6265SDimitry Andric     std::reverse(S.begin(), S.end());
57*81ad6265SDimitry Andric     return S;
58*81ad6265SDimitry Andric   }
59*81ad6265SDimitry Andric };
60*81ad6265SDimitry Andric 
61*81ad6265SDimitry Andric static Node createRoot() {
62*81ad6265SDimitry Andric   Node N;
63*81ad6265SDimitry Andric   N.IsRoot = true;
64*81ad6265SDimitry Andric   N.ChildrenOffset = 1;
65*81ad6265SDimitry Andric   N.Size = 1;
66*81ad6265SDimitry Andric   return N;
67*81ad6265SDimitry Andric }
68*81ad6265SDimitry Andric 
69*81ad6265SDimitry Andric static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
70*81ad6265SDimitry Andric   if (Offset == 0)
71*81ad6265SDimitry Andric     return createRoot();
72*81ad6265SDimitry Andric 
73*81ad6265SDimitry Andric   uint32_t Origin = Offset;
74*81ad6265SDimitry Andric   Node N;
75*81ad6265SDimitry Andric   N.Parent = Parent;
76*81ad6265SDimitry Andric   uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
77*81ad6265SDimitry Andric   if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
78*81ad6265SDimitry Andric     return N;
79*81ad6265SDimitry Andric 
80*81ad6265SDimitry Andric   bool LongName = NameInfo & 0x40;
81*81ad6265SDimitry Andric   bool HasValue = NameInfo & 0x80;
82*81ad6265SDimitry Andric   std::size_t Size = NameInfo & ~0xC0;
83*81ad6265SDimitry Andric   if (LongName) {
84*81ad6265SDimitry Andric     uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
85*81ad6265SDimitry Andric     NameOffset |= UnicodeNameToCodepointIndex[Offset++];
86*81ad6265SDimitry Andric     N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
87*81ad6265SDimitry Andric   } else {
88*81ad6265SDimitry Andric     N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
89*81ad6265SDimitry Andric   }
90*81ad6265SDimitry Andric   if (HasValue) {
91*81ad6265SDimitry Andric     uint8_t H = UnicodeNameToCodepointIndex[Offset++];
92*81ad6265SDimitry Andric     uint8_t M = UnicodeNameToCodepointIndex[Offset++];
93*81ad6265SDimitry Andric     uint8_t L = UnicodeNameToCodepointIndex[Offset++];
94*81ad6265SDimitry Andric     N.Value = ((H << 16) | (M << 8) | L) >> 3;
95*81ad6265SDimitry Andric 
96*81ad6265SDimitry Andric     bool HasChildren = L & 0x02;
97*81ad6265SDimitry Andric     N.HasSibling = L & 0x01;
98*81ad6265SDimitry Andric 
99*81ad6265SDimitry Andric     if (HasChildren) {
100*81ad6265SDimitry Andric       N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
101*81ad6265SDimitry Andric       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
102*81ad6265SDimitry Andric       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
103*81ad6265SDimitry Andric     }
104*81ad6265SDimitry Andric   } else {
105*81ad6265SDimitry Andric     uint8_t H = UnicodeNameToCodepointIndex[Offset++];
106*81ad6265SDimitry Andric     N.HasSibling = H & 0x80;
107*81ad6265SDimitry Andric     bool HasChildren = H & 0x40;
108*81ad6265SDimitry Andric     H &= ~0xC0;
109*81ad6265SDimitry Andric     if (HasChildren) {
110*81ad6265SDimitry Andric       N.ChildrenOffset = (H << 16);
111*81ad6265SDimitry Andric       N.ChildrenOffset |=
112*81ad6265SDimitry Andric           (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
113*81ad6265SDimitry Andric       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
114*81ad6265SDimitry Andric     }
115*81ad6265SDimitry Andric   }
116*81ad6265SDimitry Andric   N.Size = Offset - Origin;
117*81ad6265SDimitry Andric   return N;
118*81ad6265SDimitry Andric }
119*81ad6265SDimitry Andric 
120*81ad6265SDimitry Andric static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
121*81ad6265SDimitry Andric                        std::size_t &Consummed, char &PreviousCharInName,
122*81ad6265SDimitry Andric                        char &PreviousCharInNeedle, bool IsPrefix = false) {
123*81ad6265SDimitry Andric 
124*81ad6265SDimitry Andric   Consummed = 0;
125*81ad6265SDimitry Andric   if (Strict) {
126*81ad6265SDimitry Andric     if (!Name.startswith(Needle))
127*81ad6265SDimitry Andric       return false;
128*81ad6265SDimitry Andric     Consummed = Needle.size();
129*81ad6265SDimitry Andric     return true;
130*81ad6265SDimitry Andric   }
131*81ad6265SDimitry Andric   if (Needle.empty())
132*81ad6265SDimitry Andric     return true;
133*81ad6265SDimitry Andric 
134*81ad6265SDimitry Andric   auto NamePos = Name.begin();
135*81ad6265SDimitry Andric   auto NeedlePos = Needle.begin();
136*81ad6265SDimitry Andric 
137*81ad6265SDimitry Andric   char PreviousCharInNameOrigin = PreviousCharInName;
138*81ad6265SDimitry Andric   char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
139*81ad6265SDimitry Andric 
140*81ad6265SDimitry Andric   auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
141*81ad6265SDimitry Andric                          bool IgnoreEnd = false) {
142*81ad6265SDimitry Andric     while (It != End) {
143*81ad6265SDimitry Andric       const auto Next = std::next(It);
144*81ad6265SDimitry Andric       // Ignore spaces, underscore, medial hyphens
145*81ad6265SDimitry Andric       // https://unicode.org/reports/tr44/#UAX44-LM2.
146*81ad6265SDimitry Andric       bool Ignore =
147*81ad6265SDimitry Andric           *It == ' ' || *It == '_' ||
148*81ad6265SDimitry Andric           (*It == '-' && isAlnum(PreviousChar) &&
149*81ad6265SDimitry Andric            ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
150*81ad6265SDimitry Andric       PreviousChar = *It;
151*81ad6265SDimitry Andric       if (!Ignore)
152*81ad6265SDimitry Andric         break;
153*81ad6265SDimitry Andric       ++It;
154*81ad6265SDimitry Andric     }
155*81ad6265SDimitry Andric     return It;
156*81ad6265SDimitry Andric   };
157*81ad6265SDimitry Andric 
158*81ad6265SDimitry Andric   while (true) {
159*81ad6265SDimitry Andric     NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
160*81ad6265SDimitry Andric     NeedlePos =
161*81ad6265SDimitry Andric         IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
162*81ad6265SDimitry Andric     if (NeedlePos == Needle.end())
163*81ad6265SDimitry Andric       break;
164*81ad6265SDimitry Andric     if (NamePos == Name.end())
165*81ad6265SDimitry Andric       break;
166*81ad6265SDimitry Andric     if (toUpper(*NeedlePos) != toUpper(*NamePos))
167*81ad6265SDimitry Andric       break;
168*81ad6265SDimitry Andric     NeedlePos++;
169*81ad6265SDimitry Andric     NamePos++;
170*81ad6265SDimitry Andric   }
171*81ad6265SDimitry Andric   Consummed = std::distance(Name.begin(), NamePos);
172*81ad6265SDimitry Andric   if (NeedlePos != Needle.end()) {
173*81ad6265SDimitry Andric     PreviousCharInName = PreviousCharInNameOrigin;
174*81ad6265SDimitry Andric     PreviousCharInNeedle = PreviousCharInNeedleOrigin;
175*81ad6265SDimitry Andric   }
176*81ad6265SDimitry Andric   return NeedlePos == Needle.end();
177*81ad6265SDimitry Andric }
178*81ad6265SDimitry Andric 
179*81ad6265SDimitry Andric static std::tuple<Node, bool, uint32_t>
180*81ad6265SDimitry Andric compareNode(uint32_t Offset, StringRef Name, bool Strict,
181*81ad6265SDimitry Andric             char PreviousCharInName, char PreviousCharInNeedle,
182*81ad6265SDimitry Andric             BufferType &Buffer, const Node *Parent = nullptr) {
183*81ad6265SDimitry Andric   Node N = readNode(Offset, Parent);
184*81ad6265SDimitry Andric   std::size_t Consummed = 0;
185*81ad6265SDimitry Andric   bool DoesStartWith =
186*81ad6265SDimitry Andric       N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
187*81ad6265SDimitry Andric                              PreviousCharInName, PreviousCharInNeedle);
188*81ad6265SDimitry Andric   if (!DoesStartWith)
189*81ad6265SDimitry Andric     return std::make_tuple(N, false, 0);
190*81ad6265SDimitry Andric 
191*81ad6265SDimitry Andric   if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
192*81ad6265SDimitry Andric     return std::make_tuple(N, true, N.Value);
193*81ad6265SDimitry Andric 
194*81ad6265SDimitry Andric   if (N.hasChildren()) {
195*81ad6265SDimitry Andric     uint32_t ChildOffset = N.ChildrenOffset;
196*81ad6265SDimitry Andric     for (;;) {
197*81ad6265SDimitry Andric       Node C;
198*81ad6265SDimitry Andric       bool Matches;
199*81ad6265SDimitry Andric       uint32_t Value;
200*81ad6265SDimitry Andric       std::tie(C, Matches, Value) =
201*81ad6265SDimitry Andric           compareNode(ChildOffset, Name.substr(Consummed), Strict,
202*81ad6265SDimitry Andric                       PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
203*81ad6265SDimitry Andric       if (Matches) {
204*81ad6265SDimitry Andric         std::reverse_copy(C.Name.begin(), C.Name.end(),
205*81ad6265SDimitry Andric                           std::back_inserter(Buffer));
206*81ad6265SDimitry Andric         return std::make_tuple(N, true, Value);
207*81ad6265SDimitry Andric       }
208*81ad6265SDimitry Andric       ChildOffset += C.Size;
209*81ad6265SDimitry Andric       if (!C.HasSibling)
210*81ad6265SDimitry Andric         break;
211*81ad6265SDimitry Andric     }
212*81ad6265SDimitry Andric   }
213*81ad6265SDimitry Andric   return std::make_tuple(N, false, 0);
214*81ad6265SDimitry Andric }
215*81ad6265SDimitry Andric 
216*81ad6265SDimitry Andric static std::tuple<Node, bool, uint32_t>
217*81ad6265SDimitry Andric compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
218*81ad6265SDimitry Andric   return compareNode(Offset, Name, Strict, 0, 0, Buffer);
219*81ad6265SDimitry Andric }
220*81ad6265SDimitry Andric 
221*81ad6265SDimitry Andric // clang-format off
222*81ad6265SDimitry Andric constexpr const char *const HangulSyllables[][3] = {
223*81ad6265SDimitry Andric     { "G",  "A",   ""   },
224*81ad6265SDimitry Andric     { "GG", "AE",  "G"  },
225*81ad6265SDimitry Andric     { "N",  "YA",  "GG" },
226*81ad6265SDimitry Andric     { "D",  "YAE", "GS" },
227*81ad6265SDimitry Andric     { "DD", "EO",  "N", },
228*81ad6265SDimitry Andric     { "R",  "E",   "NJ" },
229*81ad6265SDimitry Andric     { "M",  "YEO", "NH" },
230*81ad6265SDimitry Andric     { "B",  "YE",  "D"  },
231*81ad6265SDimitry Andric     { "BB", "O",   "L"  },
232*81ad6265SDimitry Andric     { "S",  "WA",  "LG" },
233*81ad6265SDimitry Andric     { "SS", "WAE", "LM" },
234*81ad6265SDimitry Andric     { "",   "OE",  "LB" },
235*81ad6265SDimitry Andric     { "J",  "YO",  "LS" },
236*81ad6265SDimitry Andric     { "JJ", "U",   "LT" },
237*81ad6265SDimitry Andric     { "C",  "WEO", "LP" },
238*81ad6265SDimitry Andric     { "K",  "WE",  "LH" },
239*81ad6265SDimitry Andric     { "T",  "WI",  "M"  },
240*81ad6265SDimitry Andric     { "P",  "YU",  "B"  },
241*81ad6265SDimitry Andric     { "H",  "EU",  "BS" },
242*81ad6265SDimitry Andric     { 0,    "YI",  "S"  },
243*81ad6265SDimitry Andric     { 0,    "I",   "SS" },
244*81ad6265SDimitry Andric     { 0,    0,     "NG" },
245*81ad6265SDimitry Andric     { 0,    0,     "J"  },
246*81ad6265SDimitry Andric     { 0,    0,     "C"  },
247*81ad6265SDimitry Andric     { 0,    0,     "K"  },
248*81ad6265SDimitry Andric     { 0,    0,     "T"  },
249*81ad6265SDimitry Andric     { 0,    0,     "P"  },
250*81ad6265SDimitry Andric     { 0,    0,     "H"  }
251*81ad6265SDimitry Andric     };
252*81ad6265SDimitry Andric // clang-format on
253*81ad6265SDimitry Andric 
254*81ad6265SDimitry Andric // Unicode 14.0
255*81ad6265SDimitry Andric // 3.12 Conjoining Jamo Behavior Common constants
256*81ad6265SDimitry Andric constexpr const char32_t SBase = 0xAC00;
257*81ad6265SDimitry Andric constexpr const uint32_t LCount = 19;
258*81ad6265SDimitry Andric constexpr const uint32_t VCount = 21;
259*81ad6265SDimitry Andric constexpr const uint32_t TCount = 28;
260*81ad6265SDimitry Andric 
261*81ad6265SDimitry Andric static std::size_t findSyllable(StringRef Name, bool Strict,
262*81ad6265SDimitry Andric                                 char &PreviousInName, int &Pos, int Column) {
263*81ad6265SDimitry Andric   assert(Column == 0 || Column == 1 || Column == 2);
264*81ad6265SDimitry Andric   static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
265*81ad6265SDimitry Andric   char NeedleStart = 0;
266*81ad6265SDimitry Andric   int Len = -1;
267*81ad6265SDimitry Andric   int Prev = PreviousInName;
268*81ad6265SDimitry Andric   for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
269*81ad6265SDimitry Andric     StringRef Syllable(HangulSyllables[I][Column]);
270*81ad6265SDimitry Andric     if (int(Syllable.size()) <= Len)
271*81ad6265SDimitry Andric       continue;
272*81ad6265SDimitry Andric     std::size_t Consummed = 0;
273*81ad6265SDimitry Andric     char PreviousInNameCopy = PreviousInName;
274*81ad6265SDimitry Andric     bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
275*81ad6265SDimitry Andric                                     PreviousInNameCopy, NeedleStart);
276*81ad6265SDimitry Andric     if (!DoesStartWith)
277*81ad6265SDimitry Andric       continue;
278*81ad6265SDimitry Andric     Len = Consummed;
279*81ad6265SDimitry Andric     Pos = I;
280*81ad6265SDimitry Andric     Prev = PreviousInNameCopy;
281*81ad6265SDimitry Andric   }
282*81ad6265SDimitry Andric   if (Len == -1)
283*81ad6265SDimitry Andric     return 0;
284*81ad6265SDimitry Andric   PreviousInName = Prev;
285*81ad6265SDimitry Andric   return size_t(Len);
286*81ad6265SDimitry Andric }
287*81ad6265SDimitry Andric 
288*81ad6265SDimitry Andric static llvm::Optional<char32_t>
289*81ad6265SDimitry Andric nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
290*81ad6265SDimitry Andric   Buffer.clear();
291*81ad6265SDimitry Andric   // Hangul Syllable Decomposition
292*81ad6265SDimitry Andric   std::size_t Consummed = 0;
293*81ad6265SDimitry Andric   char NameStart = 0, NeedleStart = 0;
294*81ad6265SDimitry Andric   bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
295*81ad6265SDimitry Andric                                   NameStart, NeedleStart);
296*81ad6265SDimitry Andric   if (!DoesStartWith)
297*81ad6265SDimitry Andric     return None;
298*81ad6265SDimitry Andric   Name = Name.substr(Consummed);
299*81ad6265SDimitry Andric   int L = -1, V = -1, T = -1;
300*81ad6265SDimitry Andric   Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
301*81ad6265SDimitry Andric   Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
302*81ad6265SDimitry Andric   Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
303*81ad6265SDimitry Andric   if (L != -1 && V != -1 && T != -1 && Name.empty()) {
304*81ad6265SDimitry Andric     if (!Strict) {
305*81ad6265SDimitry Andric       Buffer.append("HANGUL SYLLABLE ");
306*81ad6265SDimitry Andric       if (L != -1)
307*81ad6265SDimitry Andric         Buffer.append(HangulSyllables[L][0]);
308*81ad6265SDimitry Andric       if (V != -1)
309*81ad6265SDimitry Andric         Buffer.append(HangulSyllables[V][1]);
310*81ad6265SDimitry Andric       if (T != -1)
311*81ad6265SDimitry Andric         Buffer.append(HangulSyllables[T][2]);
312*81ad6265SDimitry Andric     }
313*81ad6265SDimitry Andric     return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
314*81ad6265SDimitry Andric            std::uint32_t(T);
315*81ad6265SDimitry Andric   }
316*81ad6265SDimitry Andric   // Otherwise, it's an illegal syllable name.
317*81ad6265SDimitry Andric   return None;
318*81ad6265SDimitry Andric }
319*81ad6265SDimitry Andric 
320*81ad6265SDimitry Andric struct GeneratedNamesData {
321*81ad6265SDimitry Andric   StringRef Prefix;
322*81ad6265SDimitry Andric   uint32_t Start;
323*81ad6265SDimitry Andric   uint32_t End;
324*81ad6265SDimitry Andric };
325*81ad6265SDimitry Andric 
326*81ad6265SDimitry Andric // Unicode 14.0 Table 4-8. Name Derivation Rule Prefix Strings
327*81ad6265SDimitry Andric // This needs to be kept in sync with
328*81ad6265SDimitry Andric // llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
329*81ad6265SDimitry Andric static const GeneratedNamesData GeneratedNamesDataTable[] = {
330*81ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
331*81ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFC},
332*81ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DD},
333*81ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B734},
334*81ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
335*81ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
336*81ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
337*81ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
338*81ad6265SDimitry Andric     {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
339*81ad6265SDimitry Andric     {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
340*81ad6265SDimitry Andric     {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
341*81ad6265SDimitry Andric     {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
342*81ad6265SDimitry Andric     {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
343*81ad6265SDimitry Andric     {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
344*81ad6265SDimitry Andric     {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
345*81ad6265SDimitry Andric };
346*81ad6265SDimitry Andric 
347*81ad6265SDimitry Andric static llvm::Optional<char32_t>
348*81ad6265SDimitry Andric nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
349*81ad6265SDimitry Andric   for (auto &&Item : GeneratedNamesDataTable) {
350*81ad6265SDimitry Andric     Buffer.clear();
351*81ad6265SDimitry Andric     std::size_t Consummed = 0;
352*81ad6265SDimitry Andric     char NameStart = 0, NeedleStart = 0;
353*81ad6265SDimitry Andric     bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
354*81ad6265SDimitry Andric                                     NameStart, NeedleStart, /*isPrefix*/ true);
355*81ad6265SDimitry Andric     if (!DoesStartWith)
356*81ad6265SDimitry Andric       continue;
357*81ad6265SDimitry Andric     auto Number = Name.substr(Consummed);
358*81ad6265SDimitry Andric     unsigned long long V = 0;
359*81ad6265SDimitry Andric     // Be consistent about mandating upper casing.
360*81ad6265SDimitry Andric     if (Strict &&
361*81ad6265SDimitry Andric         llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
362*81ad6265SDimitry Andric       return {};
363*81ad6265SDimitry Andric     if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
364*81ad6265SDimitry Andric       continue;
365*81ad6265SDimitry Andric     if (!Strict) {
366*81ad6265SDimitry Andric       Buffer.append(Item.Prefix);
367*81ad6265SDimitry Andric       Buffer.append(utohexstr(V, true));
368*81ad6265SDimitry Andric     }
369*81ad6265SDimitry Andric     return V;
370*81ad6265SDimitry Andric   }
371*81ad6265SDimitry Andric   return None;
372*81ad6265SDimitry Andric }
373*81ad6265SDimitry Andric 
374*81ad6265SDimitry Andric static llvm::Optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
375*81ad6265SDimitry Andric                                                 BufferType &Buffer) {
376*81ad6265SDimitry Andric   if (Name.empty())
377*81ad6265SDimitry Andric     return None;
378*81ad6265SDimitry Andric 
379*81ad6265SDimitry Andric   llvm::Optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
380*81ad6265SDimitry Andric   if (!Res)
381*81ad6265SDimitry Andric     Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
382*81ad6265SDimitry Andric   if (Res)
383*81ad6265SDimitry Andric     return *Res;
384*81ad6265SDimitry Andric 
385*81ad6265SDimitry Andric   Buffer.clear();
386*81ad6265SDimitry Andric   Node Node;
387*81ad6265SDimitry Andric   bool Matches;
388*81ad6265SDimitry Andric   uint32_t Value;
389*81ad6265SDimitry Andric   std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
390*81ad6265SDimitry Andric   if (Matches) {
391*81ad6265SDimitry Andric     std::reverse(Buffer.begin(), Buffer.end());
392*81ad6265SDimitry Andric     // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
393*81ad6265SDimitry Andric     // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
394*81ad6265SDimitry Andric     if (!Strict && Value == 0x116c &&
395*81ad6265SDimitry Andric         Name.find_insensitive("O-E") != StringRef::npos) {
396*81ad6265SDimitry Andric       Buffer = "HANGUL JUNGSEONG O-E";
397*81ad6265SDimitry Andric       Value = 0x1180;
398*81ad6265SDimitry Andric     }
399*81ad6265SDimitry Andric     return Value;
400*81ad6265SDimitry Andric   }
401*81ad6265SDimitry Andric   return None;
402*81ad6265SDimitry Andric }
403*81ad6265SDimitry Andric 
404*81ad6265SDimitry Andric llvm::Optional<char32_t> nameToCodepointStrict(StringRef Name) {
405*81ad6265SDimitry Andric 
406*81ad6265SDimitry Andric   BufferType Buffer;
407*81ad6265SDimitry Andric   auto Opt = nameToCodepoint(Name, true, Buffer);
408*81ad6265SDimitry Andric   return Opt;
409*81ad6265SDimitry Andric }
410*81ad6265SDimitry Andric 
411*81ad6265SDimitry Andric llvm::Optional<LooseMatchingResult>
412*81ad6265SDimitry Andric nameToCodepointLooseMatching(StringRef Name) {
413*81ad6265SDimitry Andric   BufferType Buffer;
414*81ad6265SDimitry Andric   auto Opt = nameToCodepoint(Name, false, Buffer);
415*81ad6265SDimitry Andric   if (!Opt)
416*81ad6265SDimitry Andric     return None;
417*81ad6265SDimitry Andric   return LooseMatchingResult{*Opt, Buffer};
418*81ad6265SDimitry Andric }
419*81ad6265SDimitry Andric 
420*81ad6265SDimitry Andric // Find the unicode character whose editing distance to Pattern
421*81ad6265SDimitry Andric // is shortest, using the Wagner–Fischer algorithm.
422*81ad6265SDimitry Andric llvm::SmallVector<MatchForCodepointName>
423*81ad6265SDimitry Andric nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
424*81ad6265SDimitry Andric   // We maintain a fixed size vector of matches,
425*81ad6265SDimitry Andric   // sorted by distance
426*81ad6265SDimitry Andric   // The worst match (with the biggest distance) are discarded when new elements
427*81ad6265SDimitry Andric   // are added.
428*81ad6265SDimitry Andric   std::size_t LargestEditDistance = 0;
429*81ad6265SDimitry Andric   llvm::SmallVector<MatchForCodepointName> Matches;
430*81ad6265SDimitry Andric   Matches.reserve(MaxMatchesCount + 1);
431*81ad6265SDimitry Andric 
432*81ad6265SDimitry Andric   auto Insert = [&](const Node &Node, uint32_t Distance,
433*81ad6265SDimitry Andric                     char32_t Value) -> bool {
434*81ad6265SDimitry Andric     if (Distance > LargestEditDistance) {
435*81ad6265SDimitry Andric       if (Matches.size() == MaxMatchesCount)
436*81ad6265SDimitry Andric         return false;
437*81ad6265SDimitry Andric       LargestEditDistance = Distance;
438*81ad6265SDimitry Andric     }
439*81ad6265SDimitry Andric     // To avoid allocations, the creation of the name is delayed
440*81ad6265SDimitry Andric     // as much as possible.
441*81ad6265SDimitry Andric     std::string Name;
442*81ad6265SDimitry Andric     auto GetName = [&] {
443*81ad6265SDimitry Andric       if (Name.empty())
444*81ad6265SDimitry Andric         Name = Node.fullName();
445*81ad6265SDimitry Andric       return Name;
446*81ad6265SDimitry Andric     };
447*81ad6265SDimitry Andric 
448*81ad6265SDimitry Andric     auto It = std::lower_bound(
449*81ad6265SDimitry Andric         Matches.begin(), Matches.end(), Distance,
450*81ad6265SDimitry Andric         [&](const MatchForCodepointName &a, std::size_t Distance) {
451*81ad6265SDimitry Andric           if (Distance == a.Distance)
452*81ad6265SDimitry Andric             return a.Name < GetName();
453*81ad6265SDimitry Andric           return a.Distance < Distance;
454*81ad6265SDimitry Andric         });
455*81ad6265SDimitry Andric     if (It == Matches.end() && Matches.size() == MaxMatchesCount)
456*81ad6265SDimitry Andric       return false;
457*81ad6265SDimitry Andric 
458*81ad6265SDimitry Andric     MatchForCodepointName M{GetName(), Distance, Value};
459*81ad6265SDimitry Andric     Matches.insert(It, std::move(M));
460*81ad6265SDimitry Andric     if (Matches.size() > MaxMatchesCount)
461*81ad6265SDimitry Andric       Matches.pop_back();
462*81ad6265SDimitry Andric     return true;
463*81ad6265SDimitry Andric   };
464*81ad6265SDimitry Andric 
465*81ad6265SDimitry Andric   // We ignore case, space, hyphens, etc,
466*81ad6265SDimitry Andric   // in both the search pattern and the prospective names.
467*81ad6265SDimitry Andric   auto Normalize = [](StringRef Name) {
468*81ad6265SDimitry Andric     std::string Out;
469*81ad6265SDimitry Andric     Out.reserve(Name.size());
470*81ad6265SDimitry Andric     for (char C : Name) {
471*81ad6265SDimitry Andric       if (isAlnum(C))
472*81ad6265SDimitry Andric         Out.push_back(toUpper(C));
473*81ad6265SDimitry Andric     }
474*81ad6265SDimitry Andric     return Out;
475*81ad6265SDimitry Andric   };
476*81ad6265SDimitry Andric   std::string NormalizedName = Normalize(Pattern);
477*81ad6265SDimitry Andric 
478*81ad6265SDimitry Andric   // Allocate a matrix big enough for longest names.
479*81ad6265SDimitry Andric   const std::size_t Columns =
480*81ad6265SDimitry Andric       std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
481*81ad6265SDimitry Andric       1;
482*81ad6265SDimitry Andric 
483*81ad6265SDimitry Andric   LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
484*81ad6265SDimitry Andric       UnicodeNameToCodepointLargestNameSize + 1;
485*81ad6265SDimitry Andric 
486*81ad6265SDimitry Andric   std::vector<char> Distances(
487*81ad6265SDimitry Andric       Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
488*81ad6265SDimitry Andric 
489*81ad6265SDimitry Andric   auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
490*81ad6265SDimitry Andric     assert(Column < Columns);
491*81ad6265SDimitry Andric     assert(Row < Rows);
492*81ad6265SDimitry Andric     return Distances[Row * Columns + Column];
493*81ad6265SDimitry Andric   };
494*81ad6265SDimitry Andric 
495*81ad6265SDimitry Andric   for (std::size_t I = 0; I < Columns; I++)
496*81ad6265SDimitry Andric     Get(I, 0) = I;
497*81ad6265SDimitry Andric 
498*81ad6265SDimitry Andric   // Visit the childrens,
499*81ad6265SDimitry Andric   // Filling (and overriding) the matrix for the name fragment of each node
500*81ad6265SDimitry Andric   // iteratively. CompleteName is used to collect the actual name of potential
501*81ad6265SDimitry Andric   // match, respecting case and spacing.
502*81ad6265SDimitry Andric   auto VisitNode = [&](const Node &N, std::size_t Row,
503*81ad6265SDimitry Andric                        auto &VisitNode) -> void {
504*81ad6265SDimitry Andric     std::size_t J = 0;
505*81ad6265SDimitry Andric     for (; J < N.Name.size(); J++) {
506*81ad6265SDimitry Andric       if (!isAlnum(N.Name[J]))
507*81ad6265SDimitry Andric         continue;
508*81ad6265SDimitry Andric 
509*81ad6265SDimitry Andric       Get(0, Row) = Row;
510*81ad6265SDimitry Andric 
511*81ad6265SDimitry Andric       for (std::size_t I = 1; I < Columns; I++) {
512*81ad6265SDimitry Andric         const int Delete = Get(I - 1, Row) + 1;
513*81ad6265SDimitry Andric         const int Insert = Get(I, Row - 1) + 1;
514*81ad6265SDimitry Andric 
515*81ad6265SDimitry Andric         const int Replace =
516*81ad6265SDimitry Andric             Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
517*81ad6265SDimitry Andric 
518*81ad6265SDimitry Andric         Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
519*81ad6265SDimitry Andric       }
520*81ad6265SDimitry Andric 
521*81ad6265SDimitry Andric       Row++;
522*81ad6265SDimitry Andric     }
523*81ad6265SDimitry Andric 
524*81ad6265SDimitry Andric     unsigned Cost = Get(Columns - 1, Row - 1);
525*81ad6265SDimitry Andric     if (N.Value != 0xFFFFFFFF) {
526*81ad6265SDimitry Andric       Insert(N, Cost, N.Value);
527*81ad6265SDimitry Andric     }
528*81ad6265SDimitry Andric 
529*81ad6265SDimitry Andric     if (N.hasChildren()) {
530*81ad6265SDimitry Andric       auto ChildOffset = N.ChildrenOffset;
531*81ad6265SDimitry Andric       for (;;) {
532*81ad6265SDimitry Andric         Node C = readNode(ChildOffset, &N);
533*81ad6265SDimitry Andric         ChildOffset += C.Size;
534*81ad6265SDimitry Andric         if (!C.isValid())
535*81ad6265SDimitry Andric           break;
536*81ad6265SDimitry Andric         VisitNode(C, Row, VisitNode);
537*81ad6265SDimitry Andric         if (!C.HasSibling)
538*81ad6265SDimitry Andric           break;
539*81ad6265SDimitry Andric       }
540*81ad6265SDimitry Andric     }
541*81ad6265SDimitry Andric   };
542*81ad6265SDimitry Andric 
543*81ad6265SDimitry Andric   Node Root = createRoot();
544*81ad6265SDimitry Andric   VisitNode(Root, 1, VisitNode);
545*81ad6265SDimitry Andric   return Matches;
546*81ad6265SDimitry Andric }
547*81ad6265SDimitry Andric 
548*81ad6265SDimitry Andric } // namespace unicode
549*81ad6265SDimitry Andric 
550*81ad6265SDimitry Andric } // namespace sys
551*81ad6265SDimitry Andric } // namespace llvm
552