181ad6265SDimitry Andric //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties 281ad6265SDimitry Andric //-*- C++ -*-===// 381ad6265SDimitry Andric // 481ad6265SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 581ad6265SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 681ad6265SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 781ad6265SDimitry Andric // 881ad6265SDimitry Andric //===----------------------------------------------------------------------===// 981ad6265SDimitry Andric // 1081ad6265SDimitry Andric // This file implements functions to map the name or alias of a unicode 1181ad6265SDimitry Andric // character to its codepoint. 1281ad6265SDimitry Andric // 1381ad6265SDimitry Andric //===----------------------------------------------------------------------===// 1481ad6265SDimitry Andric 1581ad6265SDimitry Andric #include "llvm/ADT/STLExtras.h" 1681ad6265SDimitry Andric #include "llvm/ADT/StringExtras.h" 1781ad6265SDimitry Andric #include "llvm/ADT/StringRef.h" 1881ad6265SDimitry Andric #include "llvm/Support/Unicode.h" 1981ad6265SDimitry Andric 2081ad6265SDimitry Andric namespace llvm { 2181ad6265SDimitry Andric namespace sys { 2281ad6265SDimitry Andric namespace unicode { 2381ad6265SDimitry Andric 2481ad6265SDimitry Andric extern const char *UnicodeNameToCodepointDict; 2581ad6265SDimitry Andric extern const uint8_t *UnicodeNameToCodepointIndex; 2681ad6265SDimitry Andric extern const std::size_t UnicodeNameToCodepointIndexSize; 2781ad6265SDimitry Andric extern const std::size_t UnicodeNameToCodepointLargestNameSize; 2881ad6265SDimitry Andric 2981ad6265SDimitry Andric using BufferType = SmallString<64>; 3081ad6265SDimitry Andric 3181ad6265SDimitry Andric struct Node { 3281ad6265SDimitry Andric bool IsRoot = false; 3381ad6265SDimitry Andric char32_t Value = 0xFFFFFFFF; 3481ad6265SDimitry Andric uint32_t ChildrenOffset = 0; 3581ad6265SDimitry Andric bool HasSibling = false; 3681ad6265SDimitry Andric uint32_t Size = 0; 3781ad6265SDimitry Andric StringRef Name; 3881ad6265SDimitry Andric const Node *Parent = nullptr; 3981ad6265SDimitry Andric 4081ad6265SDimitry Andric constexpr bool isValid() const { 4181ad6265SDimitry Andric return !Name.empty() || Value == 0xFFFFFFFF; 4281ad6265SDimitry Andric } 4381ad6265SDimitry Andric constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; } 4481ad6265SDimitry Andric 4581ad6265SDimitry Andric std::string fullName() const { 4681ad6265SDimitry Andric std::string S; 4781ad6265SDimitry Andric // Reserve enough space for most unicode code points. 4881ad6265SDimitry Andric // The chosen value represent the 99th percentile of name size as of 49*bdd1243dSDimitry Andric // Unicode 15.0. 5081ad6265SDimitry Andric S.reserve(46); 5181ad6265SDimitry Andric const Node *N = this; 5281ad6265SDimitry Andric while (N) { 5381ad6265SDimitry Andric std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S)); 5481ad6265SDimitry Andric N = N->Parent; 5581ad6265SDimitry Andric } 5681ad6265SDimitry Andric std::reverse(S.begin(), S.end()); 5781ad6265SDimitry Andric return S; 5881ad6265SDimitry Andric } 5981ad6265SDimitry Andric }; 6081ad6265SDimitry Andric 6181ad6265SDimitry Andric static Node createRoot() { 6281ad6265SDimitry Andric Node N; 6381ad6265SDimitry Andric N.IsRoot = true; 6481ad6265SDimitry Andric N.ChildrenOffset = 1; 6581ad6265SDimitry Andric N.Size = 1; 6681ad6265SDimitry Andric return N; 6781ad6265SDimitry Andric } 6881ad6265SDimitry Andric 6981ad6265SDimitry Andric static Node readNode(uint32_t Offset, const Node *Parent = nullptr) { 7081ad6265SDimitry Andric if (Offset == 0) 7181ad6265SDimitry Andric return createRoot(); 7281ad6265SDimitry Andric 7381ad6265SDimitry Andric uint32_t Origin = Offset; 7481ad6265SDimitry Andric Node N; 7581ad6265SDimitry Andric N.Parent = Parent; 7681ad6265SDimitry Andric uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++]; 7781ad6265SDimitry Andric if (Offset + 6 >= UnicodeNameToCodepointIndexSize) 7881ad6265SDimitry Andric return N; 7981ad6265SDimitry Andric 8081ad6265SDimitry Andric bool LongName = NameInfo & 0x40; 8181ad6265SDimitry Andric bool HasValue = NameInfo & 0x80; 8281ad6265SDimitry Andric std::size_t Size = NameInfo & ~0xC0; 8381ad6265SDimitry Andric if (LongName) { 8481ad6265SDimitry Andric uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8); 8581ad6265SDimitry Andric NameOffset |= UnicodeNameToCodepointIndex[Offset++]; 8681ad6265SDimitry Andric N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size); 8781ad6265SDimitry Andric } else { 8881ad6265SDimitry Andric N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1); 8981ad6265SDimitry Andric } 9081ad6265SDimitry Andric if (HasValue) { 9181ad6265SDimitry Andric uint8_t H = UnicodeNameToCodepointIndex[Offset++]; 9281ad6265SDimitry Andric uint8_t M = UnicodeNameToCodepointIndex[Offset++]; 9381ad6265SDimitry Andric uint8_t L = UnicodeNameToCodepointIndex[Offset++]; 9481ad6265SDimitry Andric N.Value = ((H << 16) | (M << 8) | L) >> 3; 9581ad6265SDimitry Andric 9681ad6265SDimitry Andric bool HasChildren = L & 0x02; 9781ad6265SDimitry Andric N.HasSibling = L & 0x01; 9881ad6265SDimitry Andric 9981ad6265SDimitry Andric if (HasChildren) { 10081ad6265SDimitry Andric N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16; 10181ad6265SDimitry Andric N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8; 10281ad6265SDimitry Andric N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; 10381ad6265SDimitry Andric } 10481ad6265SDimitry Andric } else { 10581ad6265SDimitry Andric uint8_t H = UnicodeNameToCodepointIndex[Offset++]; 10681ad6265SDimitry Andric N.HasSibling = H & 0x80; 10781ad6265SDimitry Andric bool HasChildren = H & 0x40; 108*bdd1243dSDimitry Andric H &= uint8_t(~0xC0); 10981ad6265SDimitry Andric if (HasChildren) { 11081ad6265SDimitry Andric N.ChildrenOffset = (H << 16); 11181ad6265SDimitry Andric N.ChildrenOffset |= 11281ad6265SDimitry Andric (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8); 11381ad6265SDimitry Andric N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++]; 11481ad6265SDimitry Andric } 11581ad6265SDimitry Andric } 11681ad6265SDimitry Andric N.Size = Offset - Origin; 11781ad6265SDimitry Andric return N; 11881ad6265SDimitry Andric } 11981ad6265SDimitry Andric 12081ad6265SDimitry Andric static bool startsWith(StringRef Name, StringRef Needle, bool Strict, 12181ad6265SDimitry Andric std::size_t &Consummed, char &PreviousCharInName, 12281ad6265SDimitry Andric char &PreviousCharInNeedle, bool IsPrefix = false) { 12381ad6265SDimitry Andric 12481ad6265SDimitry Andric Consummed = 0; 12581ad6265SDimitry Andric if (Strict) { 12681ad6265SDimitry Andric if (!Name.startswith(Needle)) 12781ad6265SDimitry Andric return false; 12881ad6265SDimitry Andric Consummed = Needle.size(); 12981ad6265SDimitry Andric return true; 13081ad6265SDimitry Andric } 13181ad6265SDimitry Andric if (Needle.empty()) 13281ad6265SDimitry Andric return true; 13381ad6265SDimitry Andric 13481ad6265SDimitry Andric auto NamePos = Name.begin(); 13581ad6265SDimitry Andric auto NeedlePos = Needle.begin(); 13681ad6265SDimitry Andric 13781ad6265SDimitry Andric char PreviousCharInNameOrigin = PreviousCharInName; 13881ad6265SDimitry Andric char PreviousCharInNeedleOrigin = PreviousCharInNeedle; 13981ad6265SDimitry Andric 14081ad6265SDimitry Andric auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar, 14181ad6265SDimitry Andric bool IgnoreEnd = false) { 14281ad6265SDimitry Andric while (It != End) { 14381ad6265SDimitry Andric const auto Next = std::next(It); 14481ad6265SDimitry Andric // Ignore spaces, underscore, medial hyphens 14581ad6265SDimitry Andric // https://unicode.org/reports/tr44/#UAX44-LM2. 14681ad6265SDimitry Andric bool Ignore = 14781ad6265SDimitry Andric *It == ' ' || *It == '_' || 14881ad6265SDimitry Andric (*It == '-' && isAlnum(PreviousChar) && 14981ad6265SDimitry Andric ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd))); 15081ad6265SDimitry Andric PreviousChar = *It; 15181ad6265SDimitry Andric if (!Ignore) 15281ad6265SDimitry Andric break; 15381ad6265SDimitry Andric ++It; 15481ad6265SDimitry Andric } 15581ad6265SDimitry Andric return It; 15681ad6265SDimitry Andric }; 15781ad6265SDimitry Andric 15881ad6265SDimitry Andric while (true) { 15981ad6265SDimitry Andric NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName); 16081ad6265SDimitry Andric NeedlePos = 16181ad6265SDimitry Andric IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix); 16281ad6265SDimitry Andric if (NeedlePos == Needle.end()) 16381ad6265SDimitry Andric break; 16481ad6265SDimitry Andric if (NamePos == Name.end()) 16581ad6265SDimitry Andric break; 16681ad6265SDimitry Andric if (toUpper(*NeedlePos) != toUpper(*NamePos)) 16781ad6265SDimitry Andric break; 16881ad6265SDimitry Andric NeedlePos++; 16981ad6265SDimitry Andric NamePos++; 17081ad6265SDimitry Andric } 17181ad6265SDimitry Andric Consummed = std::distance(Name.begin(), NamePos); 17281ad6265SDimitry Andric if (NeedlePos != Needle.end()) { 17381ad6265SDimitry Andric PreviousCharInName = PreviousCharInNameOrigin; 17481ad6265SDimitry Andric PreviousCharInNeedle = PreviousCharInNeedleOrigin; 17581ad6265SDimitry Andric } 17681ad6265SDimitry Andric return NeedlePos == Needle.end(); 17781ad6265SDimitry Andric } 17881ad6265SDimitry Andric 17981ad6265SDimitry Andric static std::tuple<Node, bool, uint32_t> 18081ad6265SDimitry Andric compareNode(uint32_t Offset, StringRef Name, bool Strict, 18181ad6265SDimitry Andric char PreviousCharInName, char PreviousCharInNeedle, 18281ad6265SDimitry Andric BufferType &Buffer, const Node *Parent = nullptr) { 18381ad6265SDimitry Andric Node N = readNode(Offset, Parent); 18481ad6265SDimitry Andric std::size_t Consummed = 0; 18581ad6265SDimitry Andric bool DoesStartWith = 18681ad6265SDimitry Andric N.IsRoot || startsWith(Name, N.Name, Strict, Consummed, 18781ad6265SDimitry Andric PreviousCharInName, PreviousCharInNeedle); 18881ad6265SDimitry Andric if (!DoesStartWith) 18981ad6265SDimitry Andric return std::make_tuple(N, false, 0); 19081ad6265SDimitry Andric 19181ad6265SDimitry Andric if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF) 19281ad6265SDimitry Andric return std::make_tuple(N, true, N.Value); 19381ad6265SDimitry Andric 19481ad6265SDimitry Andric if (N.hasChildren()) { 19581ad6265SDimitry Andric uint32_t ChildOffset = N.ChildrenOffset; 19681ad6265SDimitry Andric for (;;) { 19781ad6265SDimitry Andric Node C; 19881ad6265SDimitry Andric bool Matches; 19981ad6265SDimitry Andric uint32_t Value; 20081ad6265SDimitry Andric std::tie(C, Matches, Value) = 20181ad6265SDimitry Andric compareNode(ChildOffset, Name.substr(Consummed), Strict, 20281ad6265SDimitry Andric PreviousCharInName, PreviousCharInNeedle, Buffer, &N); 20381ad6265SDimitry Andric if (Matches) { 20481ad6265SDimitry Andric std::reverse_copy(C.Name.begin(), C.Name.end(), 20581ad6265SDimitry Andric std::back_inserter(Buffer)); 20681ad6265SDimitry Andric return std::make_tuple(N, true, Value); 20781ad6265SDimitry Andric } 20881ad6265SDimitry Andric ChildOffset += C.Size; 20981ad6265SDimitry Andric if (!C.HasSibling) 21081ad6265SDimitry Andric break; 21181ad6265SDimitry Andric } 21281ad6265SDimitry Andric } 21381ad6265SDimitry Andric return std::make_tuple(N, false, 0); 21481ad6265SDimitry Andric } 21581ad6265SDimitry Andric 21681ad6265SDimitry Andric static std::tuple<Node, bool, uint32_t> 21781ad6265SDimitry Andric compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) { 21881ad6265SDimitry Andric return compareNode(Offset, Name, Strict, 0, 0, Buffer); 21981ad6265SDimitry Andric } 22081ad6265SDimitry Andric 22181ad6265SDimitry Andric // clang-format off 22281ad6265SDimitry Andric constexpr const char *const HangulSyllables[][3] = { 22381ad6265SDimitry Andric { "G", "A", "" }, 22481ad6265SDimitry Andric { "GG", "AE", "G" }, 22581ad6265SDimitry Andric { "N", "YA", "GG" }, 22681ad6265SDimitry Andric { "D", "YAE", "GS" }, 22781ad6265SDimitry Andric { "DD", "EO", "N", }, 22881ad6265SDimitry Andric { "R", "E", "NJ" }, 22981ad6265SDimitry Andric { "M", "YEO", "NH" }, 23081ad6265SDimitry Andric { "B", "YE", "D" }, 23181ad6265SDimitry Andric { "BB", "O", "L" }, 23281ad6265SDimitry Andric { "S", "WA", "LG" }, 23381ad6265SDimitry Andric { "SS", "WAE", "LM" }, 23481ad6265SDimitry Andric { "", "OE", "LB" }, 23581ad6265SDimitry Andric { "J", "YO", "LS" }, 23681ad6265SDimitry Andric { "JJ", "U", "LT" }, 23781ad6265SDimitry Andric { "C", "WEO", "LP" }, 23881ad6265SDimitry Andric { "K", "WE", "LH" }, 23981ad6265SDimitry Andric { "T", "WI", "M" }, 24081ad6265SDimitry Andric { "P", "YU", "B" }, 24181ad6265SDimitry Andric { "H", "EU", "BS" }, 24281ad6265SDimitry Andric { 0, "YI", "S" }, 24381ad6265SDimitry Andric { 0, "I", "SS" }, 24481ad6265SDimitry Andric { 0, 0, "NG" }, 24581ad6265SDimitry Andric { 0, 0, "J" }, 24681ad6265SDimitry Andric { 0, 0, "C" }, 24781ad6265SDimitry Andric { 0, 0, "K" }, 24881ad6265SDimitry Andric { 0, 0, "T" }, 24981ad6265SDimitry Andric { 0, 0, "P" }, 25081ad6265SDimitry Andric { 0, 0, "H" } 25181ad6265SDimitry Andric }; 25281ad6265SDimitry Andric // clang-format on 25381ad6265SDimitry Andric 254*bdd1243dSDimitry Andric // Unicode 15.0 25581ad6265SDimitry Andric // 3.12 Conjoining Jamo Behavior Common constants 25681ad6265SDimitry Andric constexpr const char32_t SBase = 0xAC00; 25781ad6265SDimitry Andric constexpr const uint32_t LCount = 19; 25881ad6265SDimitry Andric constexpr const uint32_t VCount = 21; 25981ad6265SDimitry Andric constexpr const uint32_t TCount = 28; 26081ad6265SDimitry Andric 26181ad6265SDimitry Andric static std::size_t findSyllable(StringRef Name, bool Strict, 26281ad6265SDimitry Andric char &PreviousInName, int &Pos, int Column) { 26381ad6265SDimitry Andric assert(Column == 0 || Column == 1 || Column == 2); 26481ad6265SDimitry Andric static std::size_t CountPerColumn[] = {LCount, VCount, TCount}; 26581ad6265SDimitry Andric char NeedleStart = 0; 26681ad6265SDimitry Andric int Len = -1; 26781ad6265SDimitry Andric int Prev = PreviousInName; 26881ad6265SDimitry Andric for (std::size_t I = 0; I < CountPerColumn[Column]; I++) { 26981ad6265SDimitry Andric StringRef Syllable(HangulSyllables[I][Column]); 27081ad6265SDimitry Andric if (int(Syllable.size()) <= Len) 27181ad6265SDimitry Andric continue; 27281ad6265SDimitry Andric std::size_t Consummed = 0; 27381ad6265SDimitry Andric char PreviousInNameCopy = PreviousInName; 27481ad6265SDimitry Andric bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed, 27581ad6265SDimitry Andric PreviousInNameCopy, NeedleStart); 27681ad6265SDimitry Andric if (!DoesStartWith) 27781ad6265SDimitry Andric continue; 27881ad6265SDimitry Andric Len = Consummed; 27981ad6265SDimitry Andric Pos = I; 28081ad6265SDimitry Andric Prev = PreviousInNameCopy; 28181ad6265SDimitry Andric } 28281ad6265SDimitry Andric if (Len == -1) 28381ad6265SDimitry Andric return 0; 28481ad6265SDimitry Andric PreviousInName = Prev; 28581ad6265SDimitry Andric return size_t(Len); 28681ad6265SDimitry Andric } 28781ad6265SDimitry Andric 288*bdd1243dSDimitry Andric static std::optional<char32_t> 28981ad6265SDimitry Andric nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { 29081ad6265SDimitry Andric Buffer.clear(); 29181ad6265SDimitry Andric // Hangul Syllable Decomposition 29281ad6265SDimitry Andric std::size_t Consummed = 0; 29381ad6265SDimitry Andric char NameStart = 0, NeedleStart = 0; 29481ad6265SDimitry Andric bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, 29581ad6265SDimitry Andric NameStart, NeedleStart); 29681ad6265SDimitry Andric if (!DoesStartWith) 297*bdd1243dSDimitry Andric return std::nullopt; 29881ad6265SDimitry Andric Name = Name.substr(Consummed); 29981ad6265SDimitry Andric int L = -1, V = -1, T = -1; 30081ad6265SDimitry Andric Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0)); 30181ad6265SDimitry Andric Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1)); 30281ad6265SDimitry Andric Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2)); 30381ad6265SDimitry Andric if (L != -1 && V != -1 && T != -1 && Name.empty()) { 30481ad6265SDimitry Andric if (!Strict) { 30581ad6265SDimitry Andric Buffer.append("HANGUL SYLLABLE "); 30681ad6265SDimitry Andric if (L != -1) 30781ad6265SDimitry Andric Buffer.append(HangulSyllables[L][0]); 30881ad6265SDimitry Andric if (V != -1) 30981ad6265SDimitry Andric Buffer.append(HangulSyllables[V][1]); 31081ad6265SDimitry Andric if (T != -1) 31181ad6265SDimitry Andric Buffer.append(HangulSyllables[T][2]); 31281ad6265SDimitry Andric } 31381ad6265SDimitry Andric return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount + 31481ad6265SDimitry Andric std::uint32_t(T); 31581ad6265SDimitry Andric } 31681ad6265SDimitry Andric // Otherwise, it's an illegal syllable name. 317*bdd1243dSDimitry Andric return std::nullopt; 31881ad6265SDimitry Andric } 31981ad6265SDimitry Andric 32081ad6265SDimitry Andric struct GeneratedNamesData { 32181ad6265SDimitry Andric StringRef Prefix; 32281ad6265SDimitry Andric uint32_t Start; 32381ad6265SDimitry Andric uint32_t End; 32481ad6265SDimitry Andric }; 32581ad6265SDimitry Andric 326*bdd1243dSDimitry Andric // Unicode 15.0 Table 4-8. Name Derivation Rule Prefix Strings 32781ad6265SDimitry Andric static const GeneratedNamesData GeneratedNamesDataTable[] = { 32881ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF}, 329*bdd1243dSDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF}, 330*bdd1243dSDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF}, 331*bdd1243dSDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739}, 33281ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D}, 33381ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1}, 33481ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0}, 33581ad6265SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A}, 336*bdd1243dSDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF}, 33781ad6265SDimitry Andric {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7}, 33881ad6265SDimitry Andric {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08}, 33981ad6265SDimitry Andric {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5}, 34081ad6265SDimitry Andric {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB}, 34181ad6265SDimitry Andric {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D}, 34281ad6265SDimitry Andric {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9}, 34381ad6265SDimitry Andric {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D}, 34481ad6265SDimitry Andric }; 34581ad6265SDimitry Andric 346*bdd1243dSDimitry Andric static std::optional<char32_t> 34781ad6265SDimitry Andric nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) { 34881ad6265SDimitry Andric for (auto &&Item : GeneratedNamesDataTable) { 34981ad6265SDimitry Andric Buffer.clear(); 35081ad6265SDimitry Andric std::size_t Consummed = 0; 35181ad6265SDimitry Andric char NameStart = 0, NeedleStart = 0; 35281ad6265SDimitry Andric bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed, 35381ad6265SDimitry Andric NameStart, NeedleStart, /*isPrefix*/ true); 35481ad6265SDimitry Andric if (!DoesStartWith) 35581ad6265SDimitry Andric continue; 35681ad6265SDimitry Andric auto Number = Name.substr(Consummed); 35781ad6265SDimitry Andric unsigned long long V = 0; 35881ad6265SDimitry Andric // Be consistent about mandating upper casing. 35981ad6265SDimitry Andric if (Strict && 36081ad6265SDimitry Andric llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; })) 36181ad6265SDimitry Andric return {}; 36281ad6265SDimitry Andric if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End) 36381ad6265SDimitry Andric continue; 36481ad6265SDimitry Andric if (!Strict) { 36581ad6265SDimitry Andric Buffer.append(Item.Prefix); 36681ad6265SDimitry Andric Buffer.append(utohexstr(V, true)); 36781ad6265SDimitry Andric } 36881ad6265SDimitry Andric return V; 36981ad6265SDimitry Andric } 370*bdd1243dSDimitry Andric return std::nullopt; 37181ad6265SDimitry Andric } 37281ad6265SDimitry Andric 373*bdd1243dSDimitry Andric static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict, 37481ad6265SDimitry Andric BufferType &Buffer) { 37581ad6265SDimitry Andric if (Name.empty()) 376*bdd1243dSDimitry Andric return std::nullopt; 37781ad6265SDimitry Andric 378*bdd1243dSDimitry Andric std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer); 37981ad6265SDimitry Andric if (!Res) 38081ad6265SDimitry Andric Res = nameToGeneratedCodePoint(Name, Strict, Buffer); 38181ad6265SDimitry Andric if (Res) 38281ad6265SDimitry Andric return *Res; 38381ad6265SDimitry Andric 38481ad6265SDimitry Andric Buffer.clear(); 38581ad6265SDimitry Andric Node Node; 38681ad6265SDimitry Andric bool Matches; 38781ad6265SDimitry Andric uint32_t Value; 38881ad6265SDimitry Andric std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer); 38981ad6265SDimitry Andric if (Matches) { 39081ad6265SDimitry Andric std::reverse(Buffer.begin(), Buffer.end()); 39181ad6265SDimitry Andric // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial 39281ad6265SDimitry Andric // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E. 39381ad6265SDimitry Andric if (!Strict && Value == 0x116c && 39481ad6265SDimitry Andric Name.find_insensitive("O-E") != StringRef::npos) { 39581ad6265SDimitry Andric Buffer = "HANGUL JUNGSEONG O-E"; 39681ad6265SDimitry Andric Value = 0x1180; 39781ad6265SDimitry Andric } 39881ad6265SDimitry Andric return Value; 39981ad6265SDimitry Andric } 400*bdd1243dSDimitry Andric return std::nullopt; 40181ad6265SDimitry Andric } 40281ad6265SDimitry Andric 403*bdd1243dSDimitry Andric std::optional<char32_t> nameToCodepointStrict(StringRef Name) { 40481ad6265SDimitry Andric 40581ad6265SDimitry Andric BufferType Buffer; 40681ad6265SDimitry Andric auto Opt = nameToCodepoint(Name, true, Buffer); 40781ad6265SDimitry Andric return Opt; 40881ad6265SDimitry Andric } 40981ad6265SDimitry Andric 410*bdd1243dSDimitry Andric std::optional<LooseMatchingResult> 41181ad6265SDimitry Andric nameToCodepointLooseMatching(StringRef Name) { 41281ad6265SDimitry Andric BufferType Buffer; 41381ad6265SDimitry Andric auto Opt = nameToCodepoint(Name, false, Buffer); 41481ad6265SDimitry Andric if (!Opt) 415*bdd1243dSDimitry Andric return std::nullopt; 41681ad6265SDimitry Andric return LooseMatchingResult{*Opt, Buffer}; 41781ad6265SDimitry Andric } 41881ad6265SDimitry Andric 41981ad6265SDimitry Andric // Find the unicode character whose editing distance to Pattern 42081ad6265SDimitry Andric // is shortest, using the Wagner–Fischer algorithm. 42181ad6265SDimitry Andric llvm::SmallVector<MatchForCodepointName> 42281ad6265SDimitry Andric nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) { 42381ad6265SDimitry Andric // We maintain a fixed size vector of matches, 42481ad6265SDimitry Andric // sorted by distance 42581ad6265SDimitry Andric // The worst match (with the biggest distance) are discarded when new elements 42681ad6265SDimitry Andric // are added. 42781ad6265SDimitry Andric std::size_t LargestEditDistance = 0; 42881ad6265SDimitry Andric llvm::SmallVector<MatchForCodepointName> Matches; 42981ad6265SDimitry Andric Matches.reserve(MaxMatchesCount + 1); 43081ad6265SDimitry Andric 43181ad6265SDimitry Andric auto Insert = [&](const Node &Node, uint32_t Distance, 43281ad6265SDimitry Andric char32_t Value) -> bool { 43381ad6265SDimitry Andric if (Distance > LargestEditDistance) { 43481ad6265SDimitry Andric if (Matches.size() == MaxMatchesCount) 43581ad6265SDimitry Andric return false; 43681ad6265SDimitry Andric LargestEditDistance = Distance; 43781ad6265SDimitry Andric } 43881ad6265SDimitry Andric // To avoid allocations, the creation of the name is delayed 43981ad6265SDimitry Andric // as much as possible. 44081ad6265SDimitry Andric std::string Name; 44181ad6265SDimitry Andric auto GetName = [&] { 44281ad6265SDimitry Andric if (Name.empty()) 44381ad6265SDimitry Andric Name = Node.fullName(); 44481ad6265SDimitry Andric return Name; 44581ad6265SDimitry Andric }; 44681ad6265SDimitry Andric 447*bdd1243dSDimitry Andric auto It = llvm::lower_bound( 448*bdd1243dSDimitry Andric Matches, Distance, 44981ad6265SDimitry Andric [&](const MatchForCodepointName &a, std::size_t Distance) { 45081ad6265SDimitry Andric if (Distance == a.Distance) 45181ad6265SDimitry Andric return a.Name < GetName(); 45281ad6265SDimitry Andric return a.Distance < Distance; 45381ad6265SDimitry Andric }); 45481ad6265SDimitry Andric if (It == Matches.end() && Matches.size() == MaxMatchesCount) 45581ad6265SDimitry Andric return false; 45681ad6265SDimitry Andric 45781ad6265SDimitry Andric MatchForCodepointName M{GetName(), Distance, Value}; 45881ad6265SDimitry Andric Matches.insert(It, std::move(M)); 45981ad6265SDimitry Andric if (Matches.size() > MaxMatchesCount) 46081ad6265SDimitry Andric Matches.pop_back(); 46181ad6265SDimitry Andric return true; 46281ad6265SDimitry Andric }; 46381ad6265SDimitry Andric 46481ad6265SDimitry Andric // We ignore case, space, hyphens, etc, 46581ad6265SDimitry Andric // in both the search pattern and the prospective names. 46681ad6265SDimitry Andric auto Normalize = [](StringRef Name) { 46781ad6265SDimitry Andric std::string Out; 46881ad6265SDimitry Andric Out.reserve(Name.size()); 46981ad6265SDimitry Andric for (char C : Name) { 47081ad6265SDimitry Andric if (isAlnum(C)) 47181ad6265SDimitry Andric Out.push_back(toUpper(C)); 47281ad6265SDimitry Andric } 47381ad6265SDimitry Andric return Out; 47481ad6265SDimitry Andric }; 47581ad6265SDimitry Andric std::string NormalizedName = Normalize(Pattern); 47681ad6265SDimitry Andric 47781ad6265SDimitry Andric // Allocate a matrix big enough for longest names. 47881ad6265SDimitry Andric const std::size_t Columns = 47981ad6265SDimitry Andric std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) + 48081ad6265SDimitry Andric 1; 48181ad6265SDimitry Andric 48281ad6265SDimitry Andric LLVM_ATTRIBUTE_UNUSED static std::size_t Rows = 48381ad6265SDimitry Andric UnicodeNameToCodepointLargestNameSize + 1; 48481ad6265SDimitry Andric 48581ad6265SDimitry Andric std::vector<char> Distances( 48681ad6265SDimitry Andric Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0); 48781ad6265SDimitry Andric 48881ad6265SDimitry Andric auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & { 48981ad6265SDimitry Andric assert(Column < Columns); 49081ad6265SDimitry Andric assert(Row < Rows); 49181ad6265SDimitry Andric return Distances[Row * Columns + Column]; 49281ad6265SDimitry Andric }; 49381ad6265SDimitry Andric 49481ad6265SDimitry Andric for (std::size_t I = 0; I < Columns; I++) 49581ad6265SDimitry Andric Get(I, 0) = I; 49681ad6265SDimitry Andric 49781ad6265SDimitry Andric // Visit the childrens, 49881ad6265SDimitry Andric // Filling (and overriding) the matrix for the name fragment of each node 49981ad6265SDimitry Andric // iteratively. CompleteName is used to collect the actual name of potential 50081ad6265SDimitry Andric // match, respecting case and spacing. 50181ad6265SDimitry Andric auto VisitNode = [&](const Node &N, std::size_t Row, 50281ad6265SDimitry Andric auto &VisitNode) -> void { 50381ad6265SDimitry Andric std::size_t J = 0; 50481ad6265SDimitry Andric for (; J < N.Name.size(); J++) { 50581ad6265SDimitry Andric if (!isAlnum(N.Name[J])) 50681ad6265SDimitry Andric continue; 50781ad6265SDimitry Andric 50881ad6265SDimitry Andric Get(0, Row) = Row; 50981ad6265SDimitry Andric 51081ad6265SDimitry Andric for (std::size_t I = 1; I < Columns; I++) { 51181ad6265SDimitry Andric const int Delete = Get(I - 1, Row) + 1; 51281ad6265SDimitry Andric const int Insert = Get(I, Row - 1) + 1; 51381ad6265SDimitry Andric 51481ad6265SDimitry Andric const int Replace = 51581ad6265SDimitry Andric Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0); 51681ad6265SDimitry Andric 51781ad6265SDimitry Andric Get(I, Row) = std::min(Insert, std::min(Delete, Replace)); 51881ad6265SDimitry Andric } 51981ad6265SDimitry Andric 52081ad6265SDimitry Andric Row++; 52181ad6265SDimitry Andric } 52281ad6265SDimitry Andric 52381ad6265SDimitry Andric unsigned Cost = Get(Columns - 1, Row - 1); 52481ad6265SDimitry Andric if (N.Value != 0xFFFFFFFF) { 52581ad6265SDimitry Andric Insert(N, Cost, N.Value); 52681ad6265SDimitry Andric } 52781ad6265SDimitry Andric 52881ad6265SDimitry Andric if (N.hasChildren()) { 52981ad6265SDimitry Andric auto ChildOffset = N.ChildrenOffset; 53081ad6265SDimitry Andric for (;;) { 53181ad6265SDimitry Andric Node C = readNode(ChildOffset, &N); 53281ad6265SDimitry Andric ChildOffset += C.Size; 53381ad6265SDimitry Andric if (!C.isValid()) 53481ad6265SDimitry Andric break; 53581ad6265SDimitry Andric VisitNode(C, Row, VisitNode); 53681ad6265SDimitry Andric if (!C.HasSibling) 53781ad6265SDimitry Andric break; 53881ad6265SDimitry Andric } 53981ad6265SDimitry Andric } 54081ad6265SDimitry Andric }; 54181ad6265SDimitry Andric 54281ad6265SDimitry Andric Node Root = createRoot(); 54381ad6265SDimitry Andric VisitNode(Root, 1, VisitNode); 54481ad6265SDimitry Andric return Matches; 54581ad6265SDimitry Andric } 54681ad6265SDimitry Andric 54781ad6265SDimitry Andric } // namespace unicode 54881ad6265SDimitry Andric 54981ad6265SDimitry Andric } // namespace sys 55081ad6265SDimitry Andric } // namespace llvm 551