xref: /freebsd-src/contrib/llvm-project/llvm/lib/Support/UnicodeNameToCodepoint.cpp (revision 7a6dacaca14b62ca4b74406814becb87a3fefac0)
181ad6265SDimitry Andric //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
281ad6265SDimitry Andric //-*- C++ -*-===//
381ad6265SDimitry Andric //
481ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
581ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
681ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
781ad6265SDimitry Andric //
881ad6265SDimitry Andric //===----------------------------------------------------------------------===//
981ad6265SDimitry Andric //
1081ad6265SDimitry Andric // This file implements functions to map the name or alias of a unicode
1181ad6265SDimitry Andric // character to its codepoint.
1281ad6265SDimitry Andric //
1381ad6265SDimitry Andric //===----------------------------------------------------------------------===//
1481ad6265SDimitry Andric 
1581ad6265SDimitry Andric #include "llvm/ADT/STLExtras.h"
1681ad6265SDimitry Andric #include "llvm/ADT/StringExtras.h"
1781ad6265SDimitry Andric #include "llvm/ADT/StringRef.h"
1881ad6265SDimitry Andric #include "llvm/Support/Unicode.h"
1981ad6265SDimitry Andric 
2081ad6265SDimitry Andric namespace llvm {
2181ad6265SDimitry Andric namespace sys {
2281ad6265SDimitry Andric namespace unicode {
2381ad6265SDimitry Andric 
2481ad6265SDimitry Andric extern const char *UnicodeNameToCodepointDict;
2581ad6265SDimitry Andric extern const uint8_t *UnicodeNameToCodepointIndex;
2681ad6265SDimitry Andric extern const std::size_t UnicodeNameToCodepointIndexSize;
2781ad6265SDimitry Andric extern const std::size_t UnicodeNameToCodepointLargestNameSize;
2881ad6265SDimitry Andric 
2981ad6265SDimitry Andric using BufferType = SmallString<64>;
3081ad6265SDimitry Andric 
3181ad6265SDimitry Andric struct Node {
3281ad6265SDimitry Andric   bool IsRoot = false;
3381ad6265SDimitry Andric   char32_t Value = 0xFFFFFFFF;
3481ad6265SDimitry Andric   uint32_t ChildrenOffset = 0;
3581ad6265SDimitry Andric   bool HasSibling = false;
3681ad6265SDimitry Andric   uint32_t Size = 0;
3781ad6265SDimitry Andric   StringRef Name;
3881ad6265SDimitry Andric   const Node *Parent = nullptr;
3981ad6265SDimitry Andric 
isValidllvm::sys::unicode::Node4081ad6265SDimitry Andric   constexpr bool isValid() const {
4181ad6265SDimitry Andric     return !Name.empty() || Value == 0xFFFFFFFF;
4281ad6265SDimitry Andric   }
hasChildrenllvm::sys::unicode::Node4381ad6265SDimitry Andric   constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
4481ad6265SDimitry Andric 
fullNamellvm::sys::unicode::Node4581ad6265SDimitry Andric   std::string fullName() const {
4681ad6265SDimitry Andric     std::string S;
4781ad6265SDimitry Andric     // Reserve enough space for most unicode code points.
4881ad6265SDimitry Andric     // The chosen value represent the 99th percentile of name size as of
49bdd1243dSDimitry Andric     // Unicode 15.0.
5081ad6265SDimitry Andric     S.reserve(46);
5181ad6265SDimitry Andric     const Node *N = this;
5281ad6265SDimitry Andric     while (N) {
5381ad6265SDimitry Andric       std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
5481ad6265SDimitry Andric       N = N->Parent;
5581ad6265SDimitry Andric     }
5681ad6265SDimitry Andric     std::reverse(S.begin(), S.end());
5781ad6265SDimitry Andric     return S;
5881ad6265SDimitry Andric   }
5981ad6265SDimitry Andric };
6081ad6265SDimitry Andric 
createRoot()6181ad6265SDimitry Andric static Node createRoot() {
6281ad6265SDimitry Andric   Node N;
6381ad6265SDimitry Andric   N.IsRoot = true;
6481ad6265SDimitry Andric   N.ChildrenOffset = 1;
6581ad6265SDimitry Andric   N.Size = 1;
6681ad6265SDimitry Andric   return N;
6781ad6265SDimitry Andric }
6881ad6265SDimitry Andric 
readNode(uint32_t Offset,const Node * Parent=nullptr)6981ad6265SDimitry Andric static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
7081ad6265SDimitry Andric   if (Offset == 0)
7181ad6265SDimitry Andric     return createRoot();
7281ad6265SDimitry Andric 
7381ad6265SDimitry Andric   uint32_t Origin = Offset;
7481ad6265SDimitry Andric   Node N;
7581ad6265SDimitry Andric   N.Parent = Parent;
7681ad6265SDimitry Andric   uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
7781ad6265SDimitry Andric   if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
7881ad6265SDimitry Andric     return N;
7981ad6265SDimitry Andric 
8081ad6265SDimitry Andric   bool LongName = NameInfo & 0x40;
8181ad6265SDimitry Andric   bool HasValue = NameInfo & 0x80;
8281ad6265SDimitry Andric   std::size_t Size = NameInfo & ~0xC0;
8381ad6265SDimitry Andric   if (LongName) {
8481ad6265SDimitry Andric     uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
8581ad6265SDimitry Andric     NameOffset |= UnicodeNameToCodepointIndex[Offset++];
8681ad6265SDimitry Andric     N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
8781ad6265SDimitry Andric   } else {
8881ad6265SDimitry Andric     N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
8981ad6265SDimitry Andric   }
9081ad6265SDimitry Andric   if (HasValue) {
9181ad6265SDimitry Andric     uint8_t H = UnicodeNameToCodepointIndex[Offset++];
9281ad6265SDimitry Andric     uint8_t M = UnicodeNameToCodepointIndex[Offset++];
9381ad6265SDimitry Andric     uint8_t L = UnicodeNameToCodepointIndex[Offset++];
9481ad6265SDimitry Andric     N.Value = ((H << 16) | (M << 8) | L) >> 3;
9581ad6265SDimitry Andric 
9681ad6265SDimitry Andric     bool HasChildren = L & 0x02;
9781ad6265SDimitry Andric     N.HasSibling = L & 0x01;
9881ad6265SDimitry Andric 
9981ad6265SDimitry Andric     if (HasChildren) {
10081ad6265SDimitry Andric       N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
10181ad6265SDimitry Andric       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
10281ad6265SDimitry Andric       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
10381ad6265SDimitry Andric     }
10481ad6265SDimitry Andric   } else {
10581ad6265SDimitry Andric     uint8_t H = UnicodeNameToCodepointIndex[Offset++];
10681ad6265SDimitry Andric     N.HasSibling = H & 0x80;
10781ad6265SDimitry Andric     bool HasChildren = H & 0x40;
108bdd1243dSDimitry Andric     H &= uint8_t(~0xC0);
10981ad6265SDimitry Andric     if (HasChildren) {
11081ad6265SDimitry Andric       N.ChildrenOffset = (H << 16);
11181ad6265SDimitry Andric       N.ChildrenOffset |=
11281ad6265SDimitry Andric           (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
11381ad6265SDimitry Andric       N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
11481ad6265SDimitry Andric     }
11581ad6265SDimitry Andric   }
11681ad6265SDimitry Andric   N.Size = Offset - Origin;
11781ad6265SDimitry Andric   return N;
11881ad6265SDimitry Andric }
11981ad6265SDimitry Andric 
startsWith(StringRef Name,StringRef Needle,bool Strict,std::size_t & Consummed,char & PreviousCharInName,bool IsPrefix=false)12081ad6265SDimitry Andric static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
12181ad6265SDimitry Andric                        std::size_t &Consummed, char &PreviousCharInName,
1225f757f3fSDimitry Andric                        bool IsPrefix = false) {
12381ad6265SDimitry Andric 
12481ad6265SDimitry Andric   Consummed = 0;
12581ad6265SDimitry Andric   if (Strict) {
1265f757f3fSDimitry Andric     if (!Name.starts_with(Needle))
12781ad6265SDimitry Andric       return false;
12881ad6265SDimitry Andric     Consummed = Needle.size();
12981ad6265SDimitry Andric     return true;
13081ad6265SDimitry Andric   }
13181ad6265SDimitry Andric   if (Needle.empty())
13281ad6265SDimitry Andric     return true;
13381ad6265SDimitry Andric 
13481ad6265SDimitry Andric   auto NamePos = Name.begin();
13581ad6265SDimitry Andric   auto NeedlePos = Needle.begin();
13681ad6265SDimitry Andric 
13781ad6265SDimitry Andric   char PreviousCharInNameOrigin = PreviousCharInName;
1385f757f3fSDimitry Andric   char PreviousCharInNeedle = *Needle.begin();
13981ad6265SDimitry Andric   auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
1405f757f3fSDimitry Andric                          bool IsPrefix = false) {
14181ad6265SDimitry Andric     while (It != End) {
14281ad6265SDimitry Andric       const auto Next = std::next(It);
14381ad6265SDimitry Andric       // Ignore spaces, underscore, medial hyphens
1445f757f3fSDimitry Andric       // The generator ensures a needle never ends (or starts) by a medial
1455f757f3fSDimitry Andric       // hyphen https://unicode.org/reports/tr44/#UAX44-LM2.
14681ad6265SDimitry Andric       bool Ignore =
14781ad6265SDimitry Andric           *It == ' ' || *It == '_' ||
14881ad6265SDimitry Andric           (*It == '-' && isAlnum(PreviousChar) &&
1495f757f3fSDimitry Andric            ((Next != End && isAlnum(*Next)) || (Next == End && IsPrefix)));
15081ad6265SDimitry Andric       PreviousChar = *It;
15181ad6265SDimitry Andric       if (!Ignore)
15281ad6265SDimitry Andric         break;
15381ad6265SDimitry Andric       ++It;
15481ad6265SDimitry Andric     }
15581ad6265SDimitry Andric     return It;
15681ad6265SDimitry Andric   };
15781ad6265SDimitry Andric 
15881ad6265SDimitry Andric   while (true) {
15981ad6265SDimitry Andric     NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
16081ad6265SDimitry Andric     NeedlePos =
16181ad6265SDimitry Andric         IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
16281ad6265SDimitry Andric     if (NeedlePos == Needle.end())
16381ad6265SDimitry Andric       break;
16481ad6265SDimitry Andric     if (NamePos == Name.end())
16581ad6265SDimitry Andric       break;
16681ad6265SDimitry Andric     if (toUpper(*NeedlePos) != toUpper(*NamePos))
16781ad6265SDimitry Andric       break;
16881ad6265SDimitry Andric     NeedlePos++;
16981ad6265SDimitry Andric     NamePos++;
17081ad6265SDimitry Andric   }
17181ad6265SDimitry Andric   Consummed = std::distance(Name.begin(), NamePos);
17281ad6265SDimitry Andric   if (NeedlePos != Needle.end()) {
17381ad6265SDimitry Andric     PreviousCharInName = PreviousCharInNameOrigin;
17481ad6265SDimitry Andric   }
17581ad6265SDimitry Andric   return NeedlePos == Needle.end();
17681ad6265SDimitry Andric }
17781ad6265SDimitry Andric 
17881ad6265SDimitry Andric static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset,StringRef Name,bool Strict,char PreviousCharInName,BufferType & Buffer,const Node * Parent=nullptr)17981ad6265SDimitry Andric compareNode(uint32_t Offset, StringRef Name, bool Strict,
1805f757f3fSDimitry Andric             char PreviousCharInName, BufferType &Buffer,
1815f757f3fSDimitry Andric             const Node *Parent = nullptr) {
18281ad6265SDimitry Andric   Node N = readNode(Offset, Parent);
18381ad6265SDimitry Andric   std::size_t Consummed = 0;
1845f757f3fSDimitry Andric   bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
1855f757f3fSDimitry Andric                                               PreviousCharInName);
18681ad6265SDimitry Andric   if (!DoesStartWith)
18781ad6265SDimitry Andric     return std::make_tuple(N, false, 0);
18881ad6265SDimitry Andric 
18981ad6265SDimitry Andric   if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
19081ad6265SDimitry Andric     return std::make_tuple(N, true, N.Value);
19181ad6265SDimitry Andric 
19281ad6265SDimitry Andric   if (N.hasChildren()) {
19381ad6265SDimitry Andric     uint32_t ChildOffset = N.ChildrenOffset;
19481ad6265SDimitry Andric     for (;;) {
19581ad6265SDimitry Andric       Node C;
19681ad6265SDimitry Andric       bool Matches;
19781ad6265SDimitry Andric       uint32_t Value;
19881ad6265SDimitry Andric       std::tie(C, Matches, Value) =
19981ad6265SDimitry Andric           compareNode(ChildOffset, Name.substr(Consummed), Strict,
2005f757f3fSDimitry Andric                       PreviousCharInName, Buffer, &N);
20181ad6265SDimitry Andric       if (Matches) {
20281ad6265SDimitry Andric         std::reverse_copy(C.Name.begin(), C.Name.end(),
20381ad6265SDimitry Andric                           std::back_inserter(Buffer));
20481ad6265SDimitry Andric         return std::make_tuple(N, true, Value);
20581ad6265SDimitry Andric       }
20681ad6265SDimitry Andric       ChildOffset += C.Size;
20781ad6265SDimitry Andric       if (!C.HasSibling)
20881ad6265SDimitry Andric         break;
20981ad6265SDimitry Andric     }
21081ad6265SDimitry Andric   }
21181ad6265SDimitry Andric   return std::make_tuple(N, false, 0);
21281ad6265SDimitry Andric }
21381ad6265SDimitry Andric 
21481ad6265SDimitry Andric static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset,StringRef Name,bool Strict,BufferType & Buffer)21581ad6265SDimitry Andric compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
2165f757f3fSDimitry Andric   return compareNode(Offset, Name, Strict, 0, Buffer);
21781ad6265SDimitry Andric }
21881ad6265SDimitry Andric 
21981ad6265SDimitry Andric // clang-format off
22081ad6265SDimitry Andric constexpr const char *const HangulSyllables[][3] = {
22181ad6265SDimitry Andric     { "G",  "A",   ""   },
22281ad6265SDimitry Andric     { "GG", "AE",  "G"  },
22381ad6265SDimitry Andric     { "N",  "YA",  "GG" },
22481ad6265SDimitry Andric     { "D",  "YAE", "GS" },
22581ad6265SDimitry Andric     { "DD", "EO",  "N", },
22681ad6265SDimitry Andric     { "R",  "E",   "NJ" },
22781ad6265SDimitry Andric     { "M",  "YEO", "NH" },
22881ad6265SDimitry Andric     { "B",  "YE",  "D"  },
22981ad6265SDimitry Andric     { "BB", "O",   "L"  },
23081ad6265SDimitry Andric     { "S",  "WA",  "LG" },
23181ad6265SDimitry Andric     { "SS", "WAE", "LM" },
23281ad6265SDimitry Andric     { "",   "OE",  "LB" },
23381ad6265SDimitry Andric     { "J",  "YO",  "LS" },
23481ad6265SDimitry Andric     { "JJ", "U",   "LT" },
23581ad6265SDimitry Andric     { "C",  "WEO", "LP" },
23681ad6265SDimitry Andric     { "K",  "WE",  "LH" },
23781ad6265SDimitry Andric     { "T",  "WI",  "M"  },
23881ad6265SDimitry Andric     { "P",  "YU",  "B"  },
23981ad6265SDimitry Andric     { "H",  "EU",  "BS" },
24081ad6265SDimitry Andric     { 0,    "YI",  "S"  },
24181ad6265SDimitry Andric     { 0,    "I",   "SS" },
24281ad6265SDimitry Andric     { 0,    0,     "NG" },
24381ad6265SDimitry Andric     { 0,    0,     "J"  },
24481ad6265SDimitry Andric     { 0,    0,     "C"  },
24581ad6265SDimitry Andric     { 0,    0,     "K"  },
24681ad6265SDimitry Andric     { 0,    0,     "T"  },
24781ad6265SDimitry Andric     { 0,    0,     "P"  },
24881ad6265SDimitry Andric     { 0,    0,     "H"  }
24981ad6265SDimitry Andric     };
25081ad6265SDimitry Andric // clang-format on
25181ad6265SDimitry Andric 
252bdd1243dSDimitry Andric // Unicode 15.0
25381ad6265SDimitry Andric // 3.12 Conjoining Jamo Behavior Common constants
25481ad6265SDimitry Andric constexpr const char32_t SBase = 0xAC00;
25581ad6265SDimitry Andric constexpr const uint32_t LCount = 19;
25681ad6265SDimitry Andric constexpr const uint32_t VCount = 21;
25781ad6265SDimitry Andric constexpr const uint32_t TCount = 28;
25881ad6265SDimitry Andric 
findSyllable(StringRef Name,bool Strict,char & PreviousInName,int & Pos,int Column)25981ad6265SDimitry Andric static std::size_t findSyllable(StringRef Name, bool Strict,
26081ad6265SDimitry Andric                                 char &PreviousInName, int &Pos, int Column) {
26181ad6265SDimitry Andric   assert(Column == 0 || Column == 1 || Column == 2);
26281ad6265SDimitry Andric   static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
26381ad6265SDimitry Andric   int Len = -1;
26481ad6265SDimitry Andric   int Prev = PreviousInName;
26581ad6265SDimitry Andric   for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
26681ad6265SDimitry Andric     StringRef Syllable(HangulSyllables[I][Column]);
26781ad6265SDimitry Andric     if (int(Syllable.size()) <= Len)
26881ad6265SDimitry Andric       continue;
26981ad6265SDimitry Andric     std::size_t Consummed = 0;
27081ad6265SDimitry Andric     char PreviousInNameCopy = PreviousInName;
2715f757f3fSDimitry Andric     bool DoesStartWith =
2725f757f3fSDimitry Andric         startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy);
27381ad6265SDimitry Andric     if (!DoesStartWith)
27481ad6265SDimitry Andric       continue;
27581ad6265SDimitry Andric     Len = Consummed;
27681ad6265SDimitry Andric     Pos = I;
27781ad6265SDimitry Andric     Prev = PreviousInNameCopy;
27881ad6265SDimitry Andric   }
27981ad6265SDimitry Andric   if (Len == -1)
28081ad6265SDimitry Andric     return 0;
28181ad6265SDimitry Andric   PreviousInName = Prev;
28281ad6265SDimitry Andric   return size_t(Len);
28381ad6265SDimitry Andric }
28481ad6265SDimitry Andric 
285bdd1243dSDimitry Andric static std::optional<char32_t>
nameToHangulCodePoint(StringRef Name,bool Strict,BufferType & Buffer)28681ad6265SDimitry Andric nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
28781ad6265SDimitry Andric   Buffer.clear();
28881ad6265SDimitry Andric   // Hangul Syllable Decomposition
28981ad6265SDimitry Andric   std::size_t Consummed = 0;
2905f757f3fSDimitry Andric   char NameStart = 0;
2915f757f3fSDimitry Andric   bool DoesStartWith =
2925f757f3fSDimitry Andric       startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart);
29381ad6265SDimitry Andric   if (!DoesStartWith)
294bdd1243dSDimitry Andric     return std::nullopt;
29581ad6265SDimitry Andric   Name = Name.substr(Consummed);
29681ad6265SDimitry Andric   int L = -1, V = -1, T = -1;
29781ad6265SDimitry Andric   Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
29881ad6265SDimitry Andric   Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
29981ad6265SDimitry Andric   Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
30081ad6265SDimitry Andric   if (L != -1 && V != -1 && T != -1 && Name.empty()) {
30181ad6265SDimitry Andric     if (!Strict) {
30281ad6265SDimitry Andric       Buffer.append("HANGUL SYLLABLE ");
30381ad6265SDimitry Andric       if (L != -1)
30481ad6265SDimitry Andric         Buffer.append(HangulSyllables[L][0]);
30581ad6265SDimitry Andric       if (V != -1)
30681ad6265SDimitry Andric         Buffer.append(HangulSyllables[V][1]);
30781ad6265SDimitry Andric       if (T != -1)
30881ad6265SDimitry Andric         Buffer.append(HangulSyllables[T][2]);
30981ad6265SDimitry Andric     }
31081ad6265SDimitry Andric     return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
31181ad6265SDimitry Andric            std::uint32_t(T);
31281ad6265SDimitry Andric   }
31381ad6265SDimitry Andric   // Otherwise, it's an illegal syllable name.
314bdd1243dSDimitry Andric   return std::nullopt;
31581ad6265SDimitry Andric }
31681ad6265SDimitry Andric 
31781ad6265SDimitry Andric struct GeneratedNamesData {
31881ad6265SDimitry Andric   StringRef Prefix;
31981ad6265SDimitry Andric   uint32_t Start;
32081ad6265SDimitry Andric   uint32_t End;
32181ad6265SDimitry Andric };
32281ad6265SDimitry Andric 
323*7a6dacacSDimitry Andric // Unicode 15.1 Table 4-8. Name Derivation Rule Prefix Strings
32481ad6265SDimitry Andric static const GeneratedNamesData GeneratedNamesDataTable[] = {
32581ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
326bdd1243dSDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
327bdd1243dSDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
328bdd1243dSDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
32981ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
33081ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
33181ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
332*7a6dacacSDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x2EBF0, 0x2EE5D},
33381ad6265SDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
334bdd1243dSDimitry Andric     {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
33581ad6265SDimitry Andric     {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
33681ad6265SDimitry Andric     {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
33781ad6265SDimitry Andric     {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
33881ad6265SDimitry Andric     {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
33981ad6265SDimitry Andric     {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
34081ad6265SDimitry Andric     {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
34181ad6265SDimitry Andric     {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
34281ad6265SDimitry Andric };
34381ad6265SDimitry Andric 
344bdd1243dSDimitry Andric static std::optional<char32_t>
nameToGeneratedCodePoint(StringRef Name,bool Strict,BufferType & Buffer)34581ad6265SDimitry Andric nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
34681ad6265SDimitry Andric   for (auto &&Item : GeneratedNamesDataTable) {
34781ad6265SDimitry Andric     Buffer.clear();
34881ad6265SDimitry Andric     std::size_t Consummed = 0;
3495f757f3fSDimitry Andric     char NameStart = 0;
35081ad6265SDimitry Andric     bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
3515f757f3fSDimitry Andric                                     NameStart, /*IsPrefix=*/true);
35281ad6265SDimitry Andric     if (!DoesStartWith)
35381ad6265SDimitry Andric       continue;
35481ad6265SDimitry Andric     auto Number = Name.substr(Consummed);
35581ad6265SDimitry Andric     unsigned long long V = 0;
35681ad6265SDimitry Andric     // Be consistent about mandating upper casing.
35781ad6265SDimitry Andric     if (Strict &&
35881ad6265SDimitry Andric         llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
35981ad6265SDimitry Andric       return {};
36081ad6265SDimitry Andric     if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
36181ad6265SDimitry Andric       continue;
36281ad6265SDimitry Andric     if (!Strict) {
36381ad6265SDimitry Andric       Buffer.append(Item.Prefix);
36481ad6265SDimitry Andric       Buffer.append(utohexstr(V, true));
36581ad6265SDimitry Andric     }
36681ad6265SDimitry Andric     return V;
36781ad6265SDimitry Andric   }
368bdd1243dSDimitry Andric   return std::nullopt;
36981ad6265SDimitry Andric }
37081ad6265SDimitry Andric 
nameToCodepoint(StringRef Name,bool Strict,BufferType & Buffer)371bdd1243dSDimitry Andric static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
37281ad6265SDimitry Andric                                                BufferType &Buffer) {
37381ad6265SDimitry Andric   if (Name.empty())
374bdd1243dSDimitry Andric     return std::nullopt;
37581ad6265SDimitry Andric 
376bdd1243dSDimitry Andric   std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
37781ad6265SDimitry Andric   if (!Res)
37881ad6265SDimitry Andric     Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
37981ad6265SDimitry Andric   if (Res)
38081ad6265SDimitry Andric     return *Res;
38181ad6265SDimitry Andric 
38281ad6265SDimitry Andric   Buffer.clear();
38381ad6265SDimitry Andric   Node Node;
38481ad6265SDimitry Andric   bool Matches;
38581ad6265SDimitry Andric   uint32_t Value;
38681ad6265SDimitry Andric   std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
38781ad6265SDimitry Andric   if (Matches) {
38881ad6265SDimitry Andric     std::reverse(Buffer.begin(), Buffer.end());
38981ad6265SDimitry Andric     // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
39081ad6265SDimitry Andric     // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
3915f757f3fSDimitry Andric     if (!Strict && Value == 0x116c && Name.contains_insensitive("O-E")) {
39281ad6265SDimitry Andric       Buffer = "HANGUL JUNGSEONG O-E";
39381ad6265SDimitry Andric       Value = 0x1180;
39481ad6265SDimitry Andric     }
39581ad6265SDimitry Andric     return Value;
39681ad6265SDimitry Andric   }
397bdd1243dSDimitry Andric   return std::nullopt;
39881ad6265SDimitry Andric }
39981ad6265SDimitry Andric 
nameToCodepointStrict(StringRef Name)400bdd1243dSDimitry Andric std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
40181ad6265SDimitry Andric 
40281ad6265SDimitry Andric   BufferType Buffer;
40381ad6265SDimitry Andric   auto Opt = nameToCodepoint(Name, true, Buffer);
40481ad6265SDimitry Andric   return Opt;
40581ad6265SDimitry Andric }
40681ad6265SDimitry Andric 
407bdd1243dSDimitry Andric std::optional<LooseMatchingResult>
nameToCodepointLooseMatching(StringRef Name)40881ad6265SDimitry Andric nameToCodepointLooseMatching(StringRef Name) {
40981ad6265SDimitry Andric   BufferType Buffer;
41081ad6265SDimitry Andric   auto Opt = nameToCodepoint(Name, false, Buffer);
41181ad6265SDimitry Andric   if (!Opt)
412bdd1243dSDimitry Andric     return std::nullopt;
41381ad6265SDimitry Andric   return LooseMatchingResult{*Opt, Buffer};
41481ad6265SDimitry Andric }
41581ad6265SDimitry Andric 
41681ad6265SDimitry Andric // Find the unicode character whose editing distance to Pattern
41781ad6265SDimitry Andric // is shortest, using the Wagner–Fischer algorithm.
41881ad6265SDimitry Andric llvm::SmallVector<MatchForCodepointName>
nearestMatchesForCodepointName(StringRef Pattern,std::size_t MaxMatchesCount)41981ad6265SDimitry Andric nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
42081ad6265SDimitry Andric   // We maintain a fixed size vector of matches,
42181ad6265SDimitry Andric   // sorted by distance
42281ad6265SDimitry Andric   // The worst match (with the biggest distance) are discarded when new elements
42381ad6265SDimitry Andric   // are added.
42481ad6265SDimitry Andric   std::size_t LargestEditDistance = 0;
42581ad6265SDimitry Andric   llvm::SmallVector<MatchForCodepointName> Matches;
42681ad6265SDimitry Andric   Matches.reserve(MaxMatchesCount + 1);
42781ad6265SDimitry Andric 
42881ad6265SDimitry Andric   auto Insert = [&](const Node &Node, uint32_t Distance,
42981ad6265SDimitry Andric                     char32_t Value) -> bool {
43081ad6265SDimitry Andric     if (Distance > LargestEditDistance) {
43181ad6265SDimitry Andric       if (Matches.size() == MaxMatchesCount)
43281ad6265SDimitry Andric         return false;
43381ad6265SDimitry Andric       LargestEditDistance = Distance;
43481ad6265SDimitry Andric     }
43581ad6265SDimitry Andric     // To avoid allocations, the creation of the name is delayed
43681ad6265SDimitry Andric     // as much as possible.
43781ad6265SDimitry Andric     std::string Name;
43881ad6265SDimitry Andric     auto GetName = [&] {
43981ad6265SDimitry Andric       if (Name.empty())
44081ad6265SDimitry Andric         Name = Node.fullName();
44181ad6265SDimitry Andric       return Name;
44281ad6265SDimitry Andric     };
44381ad6265SDimitry Andric 
444bdd1243dSDimitry Andric     auto It = llvm::lower_bound(
445bdd1243dSDimitry Andric         Matches, Distance,
44681ad6265SDimitry Andric         [&](const MatchForCodepointName &a, std::size_t Distance) {
44781ad6265SDimitry Andric           if (Distance == a.Distance)
44881ad6265SDimitry Andric             return a.Name < GetName();
44981ad6265SDimitry Andric           return a.Distance < Distance;
45081ad6265SDimitry Andric         });
45181ad6265SDimitry Andric     if (It == Matches.end() && Matches.size() == MaxMatchesCount)
45281ad6265SDimitry Andric       return false;
45381ad6265SDimitry Andric 
45481ad6265SDimitry Andric     MatchForCodepointName M{GetName(), Distance, Value};
45581ad6265SDimitry Andric     Matches.insert(It, std::move(M));
45681ad6265SDimitry Andric     if (Matches.size() > MaxMatchesCount)
45781ad6265SDimitry Andric       Matches.pop_back();
45881ad6265SDimitry Andric     return true;
45981ad6265SDimitry Andric   };
46081ad6265SDimitry Andric 
46181ad6265SDimitry Andric   // We ignore case, space, hyphens, etc,
46281ad6265SDimitry Andric   // in both the search pattern and the prospective names.
46381ad6265SDimitry Andric   auto Normalize = [](StringRef Name) {
46481ad6265SDimitry Andric     std::string Out;
46581ad6265SDimitry Andric     Out.reserve(Name.size());
46681ad6265SDimitry Andric     for (char C : Name) {
46781ad6265SDimitry Andric       if (isAlnum(C))
46881ad6265SDimitry Andric         Out.push_back(toUpper(C));
46981ad6265SDimitry Andric     }
47081ad6265SDimitry Andric     return Out;
47181ad6265SDimitry Andric   };
47281ad6265SDimitry Andric   std::string NormalizedName = Normalize(Pattern);
47381ad6265SDimitry Andric 
47481ad6265SDimitry Andric   // Allocate a matrix big enough for longest names.
47581ad6265SDimitry Andric   const std::size_t Columns =
47681ad6265SDimitry Andric       std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
47781ad6265SDimitry Andric       1;
47881ad6265SDimitry Andric 
47981ad6265SDimitry Andric   LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
48081ad6265SDimitry Andric       UnicodeNameToCodepointLargestNameSize + 1;
48181ad6265SDimitry Andric 
48281ad6265SDimitry Andric   std::vector<char> Distances(
48381ad6265SDimitry Andric       Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
48481ad6265SDimitry Andric 
48581ad6265SDimitry Andric   auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
48681ad6265SDimitry Andric     assert(Column < Columns);
48781ad6265SDimitry Andric     assert(Row < Rows);
48881ad6265SDimitry Andric     return Distances[Row * Columns + Column];
48981ad6265SDimitry Andric   };
49081ad6265SDimitry Andric 
49181ad6265SDimitry Andric   for (std::size_t I = 0; I < Columns; I++)
49281ad6265SDimitry Andric     Get(I, 0) = I;
49381ad6265SDimitry Andric 
49481ad6265SDimitry Andric   // Visit the childrens,
49581ad6265SDimitry Andric   // Filling (and overriding) the matrix for the name fragment of each node
49681ad6265SDimitry Andric   // iteratively. CompleteName is used to collect the actual name of potential
49781ad6265SDimitry Andric   // match, respecting case and spacing.
49881ad6265SDimitry Andric   auto VisitNode = [&](const Node &N, std::size_t Row,
49981ad6265SDimitry Andric                        auto &VisitNode) -> void {
50081ad6265SDimitry Andric     std::size_t J = 0;
50181ad6265SDimitry Andric     for (; J < N.Name.size(); J++) {
50281ad6265SDimitry Andric       if (!isAlnum(N.Name[J]))
50381ad6265SDimitry Andric         continue;
50481ad6265SDimitry Andric 
50581ad6265SDimitry Andric       Get(0, Row) = Row;
50681ad6265SDimitry Andric 
50781ad6265SDimitry Andric       for (std::size_t I = 1; I < Columns; I++) {
50881ad6265SDimitry Andric         const int Delete = Get(I - 1, Row) + 1;
50981ad6265SDimitry Andric         const int Insert = Get(I, Row - 1) + 1;
51081ad6265SDimitry Andric 
51181ad6265SDimitry Andric         const int Replace =
51281ad6265SDimitry Andric             Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
51381ad6265SDimitry Andric 
51481ad6265SDimitry Andric         Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
51581ad6265SDimitry Andric       }
51681ad6265SDimitry Andric 
51781ad6265SDimitry Andric       Row++;
51881ad6265SDimitry Andric     }
51981ad6265SDimitry Andric 
52081ad6265SDimitry Andric     unsigned Cost = Get(Columns - 1, Row - 1);
52181ad6265SDimitry Andric     if (N.Value != 0xFFFFFFFF) {
52281ad6265SDimitry Andric       Insert(N, Cost, N.Value);
52381ad6265SDimitry Andric     }
52481ad6265SDimitry Andric 
52581ad6265SDimitry Andric     if (N.hasChildren()) {
52681ad6265SDimitry Andric       auto ChildOffset = N.ChildrenOffset;
52781ad6265SDimitry Andric       for (;;) {
52881ad6265SDimitry Andric         Node C = readNode(ChildOffset, &N);
52981ad6265SDimitry Andric         ChildOffset += C.Size;
53081ad6265SDimitry Andric         if (!C.isValid())
53181ad6265SDimitry Andric           break;
53281ad6265SDimitry Andric         VisitNode(C, Row, VisitNode);
53381ad6265SDimitry Andric         if (!C.HasSibling)
53481ad6265SDimitry Andric           break;
53581ad6265SDimitry Andric       }
53681ad6265SDimitry Andric     }
53781ad6265SDimitry Andric   };
53881ad6265SDimitry Andric 
53981ad6265SDimitry Andric   Node Root = createRoot();
54081ad6265SDimitry Andric   VisitNode(Root, 1, VisitNode);
54181ad6265SDimitry Andric   return Matches;
54281ad6265SDimitry Andric }
54381ad6265SDimitry Andric 
54481ad6265SDimitry Andric } // namespace unicode
54581ad6265SDimitry Andric 
54681ad6265SDimitry Andric } // namespace sys
54781ad6265SDimitry Andric } // namespace llvm
548