xref: /openbsd-src/gnu/llvm/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp (revision d415bd752c734aee168c4ee86ff32e8cc249eb16)
1*d415bd75Srobert //===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
2*d415bd75Srobert //
3*d415bd75Srobert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*d415bd75Srobert // See https://llvm.org/LICENSE.txt for license information.
5*d415bd75Srobert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*d415bd75Srobert //
7*d415bd75Srobert //===----------------------------------------------------------------------===//
8*d415bd75Srobert //
9*d415bd75Srobert // This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
10*d415bd75Srobert // using UnicodeData.txt and NameAliases.txt available at
11*d415bd75Srobert // https://unicode.org/Public/15.0.0/ucd/
12*d415bd75Srobert //===----------------------------------------------------------------------===//
13*d415bd75Srobert 
14*d415bd75Srobert #include "llvm/ADT/STLExtras.h"
15*d415bd75Srobert #include "llvm/ADT/StringExtras.h"
16*d415bd75Srobert #include "llvm/ADT/StringRef.h"
17*d415bd75Srobert #include <algorithm>
18*d415bd75Srobert #include <array>
19*d415bd75Srobert #include <deque>
20*d415bd75Srobert #include <fstream>
21*d415bd75Srobert #include <memory>
22*d415bd75Srobert #include <optional>
23*d415bd75Srobert #include <set>
24*d415bd75Srobert #include <string>
25*d415bd75Srobert #include <unordered_map>
26*d415bd75Srobert #include <utility>
27*d415bd75Srobert #include <vector>
28*d415bd75Srobert 
29*d415bd75Srobert static const llvm::StringRef Letters =
30*d415bd75Srobert     " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
31*d415bd75Srobert 
32*d415bd75Srobert // Collect names UnicodeData.txt and AliasNames.txt
33*d415bd75Srobert // There may be multiple names per code points.
34*d415bd75Srobert static std::unordered_multimap<char32_t, std::string>
loadDataFiles(const std::string & NamesFile,const std::string & AliasesFile)35*d415bd75Srobert loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
36*d415bd75Srobert   std::unordered_multimap<char32_t, std::string> CollectedCharacters;
37*d415bd75Srobert   auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
38*d415bd75Srobert     std::ifstream InputFile(File);
39*d415bd75Srobert     for (std::string Line; getline(InputFile, Line);) {
40*d415bd75Srobert       if (Line.empty() || !isxdigit(Line[0]))
41*d415bd75Srobert         continue;
42*d415bd75Srobert       auto FirstSemiPos = Line.find(';');
43*d415bd75Srobert       if (FirstSemiPos == std::string::npos)
44*d415bd75Srobert         continue;
45*d415bd75Srobert       auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
46*d415bd75Srobert       if (SecondSemiPos == std::string::npos)
47*d415bd75Srobert         continue;
48*d415bd75Srobert       unsigned long long CodePoint;
49*d415bd75Srobert       if (llvm::getAsUnsignedInteger(
50*d415bd75Srobert               llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) {
51*d415bd75Srobert         continue;
52*d415bd75Srobert       }
53*d415bd75Srobert 
54*d415bd75Srobert       std::string Name =
55*d415bd75Srobert           Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);
56*d415bd75Srobert 
57*d415bd75Srobert       if (!Name.empty() && Name[0] == '<') {
58*d415bd75Srobert         // Ignore ranges of characters, as their name is either absent or
59*d415bd75Srobert         // generated.
60*d415bd75Srobert         continue;
61*d415bd75Srobert       }
62*d415bd75Srobert 
63*d415bd75Srobert       // Some aliases are ignored for compatibility with C++
64*d415bd75Srobert       if (IsAliasFile) {
65*d415bd75Srobert         std::string Kind = Line.substr(SecondSemiPos + 1);
66*d415bd75Srobert         if (Kind != "control" && Kind != "correction" && Kind != "alternate")
67*d415bd75Srobert           continue;
68*d415bd75Srobert       }
69*d415bd75Srobert 
70*d415bd75Srobert       auto InsertUnique = [&](char32_t CP, std::string Name) {
71*d415bd75Srobert         auto It = CollectedCharacters.find(CP);
72*d415bd75Srobert         while (It != std::end(CollectedCharacters) && It->first == CP) {
73*d415bd75Srobert           if (It->second == Name)
74*d415bd75Srobert             return;
75*d415bd75Srobert           ++It;
76*d415bd75Srobert         }
77*d415bd75Srobert         CollectedCharacters.insert({CP, std::move(Name)});
78*d415bd75Srobert       };
79*d415bd75Srobert       InsertUnique(CodePoint, std::move(Name));
80*d415bd75Srobert     }
81*d415bd75Srobert   };
82*d415bd75Srobert 
83*d415bd75Srobert   FromFile(NamesFile);
84*d415bd75Srobert   FromFile(AliasesFile, true);
85*d415bd75Srobert   return CollectedCharacters;
86*d415bd75Srobert }
87*d415bd75Srobert 
88*d415bd75Srobert class Trie {
89*d415bd75Srobert   struct Node;
90*d415bd75Srobert 
91*d415bd75Srobert public:
92*d415bd75Srobert   // When inserting named codepoint
93*d415bd75Srobert   // We create a node per character in the name.
94*d415bd75Srobert   // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
95*d415bd75Srobert   // Once all  characters are inserted, the tree is compacted
insert(llvm::StringRef Name,char32_t Codepoint)96*d415bd75Srobert   void insert(llvm::StringRef Name, char32_t Codepoint) {
97*d415bd75Srobert     Node *N = Root.get();
98*d415bd75Srobert     for (auto Ch : Name) {
99*d415bd75Srobert       std::string Label(1, Ch);
100*d415bd75Srobert       auto It = llvm::find_if(N->Children,
101*d415bd75Srobert                               [&](const auto &C) { return C->Name == Label; });
102*d415bd75Srobert       if (It == N->Children.end()) {
103*d415bd75Srobert         It = N->Children.insert(It, std::make_unique<Node>(Label, N));
104*d415bd75Srobert       }
105*d415bd75Srobert       N = It->get();
106*d415bd75Srobert     }
107*d415bd75Srobert     N->Value = Codepoint;
108*d415bd75Srobert   }
109*d415bd75Srobert 
compact()110*d415bd75Srobert   void compact() { compact(Root.get()); }
111*d415bd75Srobert 
112*d415bd75Srobert   // This creates 2 arrays of bytes from the tree:
113*d415bd75Srobert   // A serialized dictionary of node labels,
114*d415bd75Srobert   // And the nodes themselves.
115*d415bd75Srobert   // The name of each label is found by indexing into the dictionary.
116*d415bd75Srobert   // The longest names are inserted first into the dictionary,
117*d415bd75Srobert   // in the hope it will contain shorter labels as substring,
118*d415bd75Srobert   // thereby reducing duplication.
119*d415bd75Srobert   // We could theorically be more clever by trying to minimizing the size
120*d415bd75Srobert   // of the dictionary.
serialize()121*d415bd75Srobert   std::pair<std::string, std::vector<uint8_t>> serialize() {
122*d415bd75Srobert     std::set<std::string> Names = this->getNameFragments();
123*d415bd75Srobert     std::vector<std::string> Sorted(Names.begin(), Names.end());
124*d415bd75Srobert     llvm::sort(Sorted, [](const auto &a, const auto &b) {
125*d415bd75Srobert       return a.size() > b.size();
126*d415bd75Srobert     });
127*d415bd75Srobert     std::string Dict(Letters.begin(), Letters.end());
128*d415bd75Srobert     Dict.reserve(50000);
129*d415bd75Srobert     for (const std::string &Name : Sorted) {
130*d415bd75Srobert       if (Name.size() <= 1)
131*d415bd75Srobert         continue;
132*d415bd75Srobert       if (Dict.find(Name) != std::string::npos)
133*d415bd75Srobert         continue;
134*d415bd75Srobert       Dict += Name;
135*d415bd75Srobert     }
136*d415bd75Srobert 
137*d415bd75Srobert     if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
138*d415bd75Srobert       fprintf(stderr, "Dictionary too big  to be serialized");
139*d415bd75Srobert       exit(1);
140*d415bd75Srobert     }
141*d415bd75Srobert 
142*d415bd75Srobert     auto Bytes = dumpIndex(Dict);
143*d415bd75Srobert     return {Dict, Bytes};
144*d415bd75Srobert   }
145*d415bd75Srobert 
getNameFragments()146*d415bd75Srobert   std::set<std::string> getNameFragments() {
147*d415bd75Srobert     std::set<std::string> Keys;
148*d415bd75Srobert     collectKeys(Root.get(), Keys);
149*d415bd75Srobert     return Keys;
150*d415bd75Srobert   }
151*d415bd75Srobert 
152*d415bd75Srobert   // Maps a valid char in an Unicode character name
153*d415bd75Srobert   // To a 6 bits index.
letter(char C)154*d415bd75Srobert   static uint8_t letter(char C) {
155*d415bd75Srobert     auto Pos = Letters.find(C);
156*d415bd75Srobert     assert(Pos != std::string::npos &&
157*d415bd75Srobert            "Invalid letter in Unicode character name");
158*d415bd75Srobert     return Pos;
159*d415bd75Srobert   }
160*d415bd75Srobert 
161*d415bd75Srobert   // clang-format off
162*d415bd75Srobert   // +================+============+======================+=============+========+===+==============+===============+
163*d415bd75Srobert   // | 0          | 1             | 2-7 (6)              | 8-23        | 24-44  |    | 46           | 47            |
164*d415bd75Srobert   // +================+============+======================+=============+========+===+==============+===============+
165*d415bd75Srobert   // | Has Value |  Has Long Name | Letter OR Name Size  | Dict Index  | Value  |    | Has Sibling  | Has Children  |
166*d415bd75Srobert   // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
167*d415bd75Srobert   // clang-format on
168*d415bd75Srobert 
dumpIndex(const std::string & Dict)169*d415bd75Srobert   std::vector<uint8_t> dumpIndex(const std::string &Dict) {
170*d415bd75Srobert     struct ChildrenOffset {
171*d415bd75Srobert       Node *FirstChild;
172*d415bd75Srobert       std::size_t Offset;
173*d415bd75Srobert       bool HasValue;
174*d415bd75Srobert     };
175*d415bd75Srobert 
176*d415bd75Srobert     // Keep track of the start of each node
177*d415bd75Srobert     // position in the serialized data.
178*d415bd75Srobert     std::unordered_map<Node *, int32_t> Offsets;
179*d415bd75Srobert 
180*d415bd75Srobert     // Keep track of where to write the index
181*d415bd75Srobert     // of the first children
182*d415bd75Srobert     std::vector<ChildrenOffset> ChildrenOffsets;
183*d415bd75Srobert     std::unordered_map<Node *, bool> SiblingTracker;
184*d415bd75Srobert     std::deque<Node *> AllNodes;
185*d415bd75Srobert     std::vector<uint8_t> Bytes;
186*d415bd75Srobert     Bytes.reserve(250'000);
187*d415bd75Srobert     // This leading byte is used by the reading code to detect the root node.
188*d415bd75Srobert     Bytes.push_back(0);
189*d415bd75Srobert 
190*d415bd75Srobert     auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
191*d415bd75Srobert       for (std::size_t Index = 0; Index < Children.size(); Index++) {
192*d415bd75Srobert         const std::unique_ptr<Node> &Child = Children[Index];
193*d415bd75Srobert         AllNodes.push_back(Child.get());
194*d415bd75Srobert         if (Index != Children.size() - 1)
195*d415bd75Srobert           SiblingTracker[Child.get()] = true;
196*d415bd75Srobert       }
197*d415bd75Srobert     };
198*d415bd75Srobert     CollectChildren(Root->Children);
199*d415bd75Srobert 
200*d415bd75Srobert     while (!AllNodes.empty()) {
201*d415bd75Srobert       const std::size_t Offset = Bytes.size();
202*d415bd75Srobert       Node *const N = AllNodes.front();
203*d415bd75Srobert       AllNodes.pop_front();
204*d415bd75Srobert 
205*d415bd75Srobert       assert(!N->Name.empty());
206*d415bd75Srobert       Offsets[N] = Offset;
207*d415bd75Srobert 
208*d415bd75Srobert       uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
209*d415bd75Srobert       // Single letter node are indexed in 6 bits
210*d415bd75Srobert       if (N->Name.size() == 1) {
211*d415bd75Srobert         FirstByte |= letter(N->Name[0]);
212*d415bd75Srobert         Bytes.push_back(FirstByte);
213*d415bd75Srobert       } else {
214*d415bd75Srobert         // Otherwise we use a 16 bits index
215*d415bd75Srobert         FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
216*d415bd75Srobert         Bytes.push_back(FirstByte);
217*d415bd75Srobert         auto PosInDict = Dict.find(N->Name);
218*d415bd75Srobert         assert(PosInDict != std::string::npos);
219*d415bd75Srobert         uint8_t Low = PosInDict;
220*d415bd75Srobert         uint8_t High = ((PosInDict >> 8) & 0xFF);
221*d415bd75Srobert         Bytes.push_back(High);
222*d415bd75Srobert         Bytes.push_back(Low);
223*d415bd75Srobert       }
224*d415bd75Srobert 
225*d415bd75Srobert       const bool HasSibling = SiblingTracker.count(N) != 0;
226*d415bd75Srobert       const bool HasChildren = N->Children.size() != 0;
227*d415bd75Srobert 
228*d415bd75Srobert       if (!!N->Value) {
229*d415bd75Srobert         uint32_t Value = (*(N->Value) << 3);
230*d415bd75Srobert         uint8_t H = ((Value >> 16) & 0xFF);
231*d415bd75Srobert         uint8_t M = ((Value >> 8) & 0xFF);
232*d415bd75Srobert         uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
233*d415bd75Srobert                     uint8_t(HasChildren ? 0x02 : 0);
234*d415bd75Srobert 
235*d415bd75Srobert         Bytes.push_back(H);
236*d415bd75Srobert         Bytes.push_back(M);
237*d415bd75Srobert         Bytes.push_back(L);
238*d415bd75Srobert 
239*d415bd75Srobert         if (HasChildren) {
240*d415bd75Srobert           ChildrenOffsets.push_back(
241*d415bd75Srobert               ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
242*d415bd75Srobert           // index of the first children
243*d415bd75Srobert           Bytes.push_back(0x00);
244*d415bd75Srobert           Bytes.push_back(0x00);
245*d415bd75Srobert           Bytes.push_back(0x00);
246*d415bd75Srobert         }
247*d415bd75Srobert       } else {
248*d415bd75Srobert         // When there is no value (that's most intermediate nodes)
249*d415bd75Srobert         // Dispense of the 3 values bytes, and only store
250*d415bd75Srobert         // 1 byte to track whether the node has sibling and children
251*d415bd75Srobert         // + 2 bytes for the index of the first children if necessary.
252*d415bd75Srobert         // That index also uses bytes 0-6 of the previous byte.
253*d415bd75Srobert         uint8_t Byte =
254*d415bd75Srobert             uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
255*d415bd75Srobert         Bytes.push_back(Byte);
256*d415bd75Srobert         if (HasChildren) {
257*d415bd75Srobert           ChildrenOffsets.emplace_back(
258*d415bd75Srobert               ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
259*d415bd75Srobert           Bytes.push_back(0x00);
260*d415bd75Srobert           Bytes.push_back(0x00);
261*d415bd75Srobert         }
262*d415bd75Srobert       }
263*d415bd75Srobert       CollectChildren(N->Children);
264*d415bd75Srobert     }
265*d415bd75Srobert 
266*d415bd75Srobert     // Once all the nodes are in the inndex
267*d415bd75Srobert     // Fill the bytes we left to indicate the position
268*d415bd75Srobert     // of the children
269*d415bd75Srobert     for (const ChildrenOffset &Parent : ChildrenOffsets) {
270*d415bd75Srobert       const auto It = Offsets.find(Parent.FirstChild);
271*d415bd75Srobert       assert(It != Offsets.end());
272*d415bd75Srobert       std::size_t Pos = It->second;
273*d415bd75Srobert       if (Parent.HasValue) {
274*d415bd75Srobert         Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
275*d415bd75Srobert       } else {
276*d415bd75Srobert         Bytes[Parent.Offset] =
277*d415bd75Srobert             Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
278*d415bd75Srobert       }
279*d415bd75Srobert       Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
280*d415bd75Srobert       Bytes[Parent.Offset + 2] = Pos & 0xFF;
281*d415bd75Srobert     }
282*d415bd75Srobert 
283*d415bd75Srobert     // Add some padding so that the deserialization code
284*d415bd75Srobert     // doesn't try to read past the enf of the array.
285*d415bd75Srobert     Bytes.push_back(0);
286*d415bd75Srobert     Bytes.push_back(0);
287*d415bd75Srobert     Bytes.push_back(0);
288*d415bd75Srobert     Bytes.push_back(0);
289*d415bd75Srobert     Bytes.push_back(0);
290*d415bd75Srobert     Bytes.push_back(0);
291*d415bd75Srobert 
292*d415bd75Srobert     return Bytes;
293*d415bd75Srobert   }
294*d415bd75Srobert 
295*d415bd75Srobert private:
collectKeys(Node * N,std::set<std::string> & Keys)296*d415bd75Srobert   void collectKeys(Node *N, std::set<std::string> &Keys) {
297*d415bd75Srobert     Keys.insert(N->Name);
298*d415bd75Srobert     for (const std::unique_ptr<Node> &Child : N->Children) {
299*d415bd75Srobert       collectKeys(Child.get(), Keys);
300*d415bd75Srobert     }
301*d415bd75Srobert   }
302*d415bd75Srobert 
303*d415bd75Srobert   // Merge sequences of 1-character nodes
304*d415bd75Srobert   // This greatly reduce the total number of nodes,
305*d415bd75Srobert   // and therefore the size of the index.
306*d415bd75Srobert   // When the tree gets serialized, we only have 5 bytes to store the
307*d415bd75Srobert   // size of a name. Overlong names (>32 characters) are therefore
308*d415bd75Srobert   // kep into separate nodes
compact(Node * N)309*d415bd75Srobert   void compact(Node *N) {
310*d415bd75Srobert     for (auto &&Child : N->Children) {
311*d415bd75Srobert       compact(Child.get());
312*d415bd75Srobert     }
313*d415bd75Srobert     if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
314*d415bd75Srobert         (N->Parent->Name.size() + N->Name.size() <= 32)) {
315*d415bd75Srobert       N->Parent->Value = N->Value;
316*d415bd75Srobert       N->Parent->Name += N->Name;
317*d415bd75Srobert       N->Parent->Children = std::move(N->Children);
318*d415bd75Srobert       for (std::unique_ptr<Node> &c : N->Parent->Children) {
319*d415bd75Srobert         c->Parent = N->Parent;
320*d415bd75Srobert       }
321*d415bd75Srobert     }
322*d415bd75Srobert   }
323*d415bd75Srobert   struct Node {
NodeTrie::Node324*d415bd75Srobert     Node(std::string Name, Node *Parent = nullptr)
325*d415bd75Srobert         : Name(Name), Parent(Parent) {}
326*d415bd75Srobert 
327*d415bd75Srobert     std::vector<std::unique_ptr<Node>> Children;
328*d415bd75Srobert     std::string Name;
329*d415bd75Srobert     Node *Parent = nullptr;
330*d415bd75Srobert     std::optional<char32_t> Value;
331*d415bd75Srobert   };
332*d415bd75Srobert 
333*d415bd75Srobert   std::unique_ptr<Node> Root = std::make_unique<Node>("");
334*d415bd75Srobert };
335*d415bd75Srobert 
// Text of the Unicode license; defined at the end of this file and written
// into the generated source by main().
336*d415bd75Srobert extern const char *UnicodeLicense;
337*d415bd75Srobert 
main(int argc,char ** argv)338*d415bd75Srobert int main(int argc, char **argv) {
339*d415bd75Srobert   printf("Unicode name -> codepoint mapping generator\n"
340*d415bd75Srobert          "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
341*d415bd75Srobert          argv[0]);
342*d415bd75Srobert   printf("NameAliases.txt can be found at "
343*d415bd75Srobert          "https://unicode.org/Public/15.0.0/ucd/NameAliases.txt\n"
344*d415bd75Srobert          "UnicodeData.txt can be found at "
345*d415bd75Srobert          "https://unicode.org/Public/15.0.0/ucd/UnicodeData.txt\n\n");
346*d415bd75Srobert 
347*d415bd75Srobert   if (argc != 4)
348*d415bd75Srobert     return EXIT_FAILURE;
349*d415bd75Srobert 
350*d415bd75Srobert   FILE *Out = fopen(argv[3], "w");
351*d415bd75Srobert   if (!Out) {
352*d415bd75Srobert     printf("Error creating output file.\n");
353*d415bd75Srobert     return EXIT_FAILURE;
354*d415bd75Srobert   }
355*d415bd75Srobert 
356*d415bd75Srobert   Trie T;
357*d415bd75Srobert   uint32_t NameCount = 0;
358*d415bd75Srobert   std::size_t LongestName = 0;
359*d415bd75Srobert   auto Entries = loadDataFiles(argv[1], argv[2]);
360*d415bd75Srobert   for (const std::pair<const char32_t, std::string> &Entry : Entries) {
361*d415bd75Srobert     char32_t Codepoint = Entry.first;
362*d415bd75Srobert     const std::string &Name = Entry.second;
363*d415bd75Srobert     // Ignore names which are not valid.
364*d415bd75Srobert     if (Name.empty() || !llvm::all_of(Name, [](char C) {
365*d415bd75Srobert           return llvm::is_contained(Letters, C);
366*d415bd75Srobert         })) {
367*d415bd75Srobert       continue;
368*d415bd75Srobert     }
369*d415bd75Srobert     printf("%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
370*d415bd75Srobert     T.insert(Name, Codepoint);
371*d415bd75Srobert     LongestName =
372*d415bd75Srobert         std::max(LongestName, std::size_t(llvm::count_if(Name, llvm::isAlnum)));
373*d415bd75Srobert     NameCount++;
374*d415bd75Srobert   }
375*d415bd75Srobert   T.compact();
376*d415bd75Srobert 
377*d415bd75Srobert   std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
378*d415bd75Srobert   const std::string &Dict = Data.first;
379*d415bd75Srobert   const std::vector<uint8_t> &Tree = Data.second;
380*d415bd75Srobert 
381*d415bd75Srobert   fprintf(Out, R"(
382*d415bd75Srobert //===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
383*d415bd75Srobert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
384*d415bd75Srobert // See https://llvm.org/LICENSE.txt for license information.
385*d415bd75Srobert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
386*d415bd75Srobert //
387*d415bd75Srobert //===----------------------------------------------------------------------===//
388*d415bd75Srobert //
389*d415bd75Srobert // This file implements mapping the name of a unicode code point to its value.
390*d415bd75Srobert //
391*d415bd75Srobert // This file was generated using %s.
392*d415bd75Srobert // Do not edit manually.
393*d415bd75Srobert //
394*d415bd75Srobert //===----------------------------------------------------------------------===//
395*d415bd75Srobert %s
396*d415bd75Srobert 
397*d415bd75Srobert 
398*d415bd75Srobert 
399*d415bd75Srobert #include "llvm/Support/Compiler.h"
400*d415bd75Srobert #include <cstddef>
401*d415bd75Srobert #include <cstdint>
402*d415bd75Srobert )",
403*d415bd75Srobert           argv[0], UnicodeLicense);
404*d415bd75Srobert 
405*d415bd75Srobert   fprintf(Out,
406*d415bd75Srobert           "namespace llvm { namespace sys { namespace unicode { \n"
407*d415bd75Srobert           "extern const char *UnicodeNameToCodepointDict;\n"
408*d415bd75Srobert           "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
409*d415bd75Srobert           "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
410*d415bd75Srobert           "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
411*d415bd75Srobert 
412*d415bd75Srobert   fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
413*d415bd75Srobert           Dict.c_str());
414*d415bd75Srobert 
415*d415bd75Srobert   fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
416*d415bd75Srobert           Tree.size() + 1);
417*d415bd75Srobert 
418*d415bd75Srobert   for (auto Byte : Tree) {
419*d415bd75Srobert     fprintf(Out, "0x%02x,", Byte);
420*d415bd75Srobert   }
421*d415bd75Srobert 
422*d415bd75Srobert   fprintf(Out, "0};");
423*d415bd75Srobert   fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
424*d415bd75Srobert                "UnicodeNameToCodepointIndex_; \n");
425*d415bd75Srobert   fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
426*d415bd75Srobert           Tree.size() + 1);
427*d415bd75Srobert   fprintf(Out,
428*d415bd75Srobert           "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
429*d415bd75Srobert           LongestName);
430*d415bd75Srobert   fprintf(Out, "\n}}}\n");
431*d415bd75Srobert   fclose(Out);
432*d415bd75Srobert   printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
433*d415bd75Srobert          argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
434*d415bd75Srobert }
435*d415bd75Srobert 
// License text of the Unicode data files. main() passes this string to the
// "%s" placeholder of the generated file's header, so it is emitted verbatim
// (it is itself a C-style block comment).
436*d415bd75Srobert const char *UnicodeLicense = R"(
437*d415bd75Srobert /*
438*d415bd75Srobert UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
439*d415bd75Srobert 
440*d415bd75Srobert See Terms of Use <https://www.unicode.org/copyright.html>
441*d415bd75Srobert for definitions of Unicode Inc.’s Data Files and Software.
442*d415bd75Srobert 
443*d415bd75Srobert NOTICE TO USER: Carefully read the following legal agreement.
444*d415bd75Srobert BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
445*d415bd75Srobert DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
446*d415bd75Srobert YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
447*d415bd75Srobert TERMS AND CONDITIONS OF THIS AGREEMENT.
448*d415bd75Srobert IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
449*d415bd75Srobert THE DATA FILES OR SOFTWARE.
450*d415bd75Srobert 
451*d415bd75Srobert COPYRIGHT AND PERMISSION NOTICE
452*d415bd75Srobert 
453*d415bd75Srobert Copyright © 1991-2022 Unicode, Inc. All rights reserved.
454*d415bd75Srobert Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
455*d415bd75Srobert 
456*d415bd75Srobert Permission is hereby granted, free of charge, to any person obtaining
457*d415bd75Srobert a copy of the Unicode data files and any associated documentation
458*d415bd75Srobert (the "Data Files") or Unicode software and any associated documentation
459*d415bd75Srobert (the "Software") to deal in the Data Files or Software
460*d415bd75Srobert without restriction, including without limitation the rights to use,
461*d415bd75Srobert copy, modify, merge, publish, distribute, and/or sell copies of
462*d415bd75Srobert the Data Files or Software, and to permit persons to whom the Data Files
463*d415bd75Srobert or Software are furnished to do so, provided that either
464*d415bd75Srobert (a) this copyright and permission notice appear with all copies
465*d415bd75Srobert of the Data Files or Software, or
466*d415bd75Srobert (b) this copyright and permission notice appear in associated
467*d415bd75Srobert Documentation.
468*d415bd75Srobert 
469*d415bd75Srobert THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
470*d415bd75Srobert ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
471*d415bd75Srobert WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
472*d415bd75Srobert NONINFRINGEMENT OF THIRD PARTY RIGHTS.
473*d415bd75Srobert IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
474*d415bd75Srobert NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
475*d415bd75Srobert DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
476*d415bd75Srobert DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
477*d415bd75Srobert TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
478*d415bd75Srobert PERFORMANCE OF THE DATA FILES OR SOFTWARE.
479*d415bd75Srobert 
480*d415bd75Srobert Except as contained in this notice, the name of a copyright holder
481*d415bd75Srobert shall not be used in advertising or otherwise to promote the sale,
482*d415bd75Srobert use or other dealings in these Data Files or Software without prior
483*d415bd75Srobert written authorization of the copyright holder.
484*d415bd75Srobert */
485*d415bd75Srobert )";
486