//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
// using UnicodeData.txt and NameAliases.txt available at
// https://unicode.org/Public/15.0.0/ucd/
//===----------------------------------------------------------------------===//
13*d415bd75Srobert
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <deque>
#include <fstream>
#include <limits>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
28*d415bd75Srobert
29*d415bd75Srobert static const llvm::StringRef Letters =
30*d415bd75Srobert " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
31*d415bd75Srobert
32*d415bd75Srobert // Collect names UnicodeData.txt and AliasNames.txt
33*d415bd75Srobert // There may be multiple names per code points.
34*d415bd75Srobert static std::unordered_multimap<char32_t, std::string>
loadDataFiles(const std::string & NamesFile,const std::string & AliasesFile)35*d415bd75Srobert loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
36*d415bd75Srobert std::unordered_multimap<char32_t, std::string> CollectedCharacters;
37*d415bd75Srobert auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
38*d415bd75Srobert std::ifstream InputFile(File);
39*d415bd75Srobert for (std::string Line; getline(InputFile, Line);) {
40*d415bd75Srobert if (Line.empty() || !isxdigit(Line[0]))
41*d415bd75Srobert continue;
42*d415bd75Srobert auto FirstSemiPos = Line.find(';');
43*d415bd75Srobert if (FirstSemiPos == std::string::npos)
44*d415bd75Srobert continue;
45*d415bd75Srobert auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
46*d415bd75Srobert if (SecondSemiPos == std::string::npos)
47*d415bd75Srobert continue;
48*d415bd75Srobert unsigned long long CodePoint;
49*d415bd75Srobert if (llvm::getAsUnsignedInteger(
50*d415bd75Srobert llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) {
51*d415bd75Srobert continue;
52*d415bd75Srobert }
53*d415bd75Srobert
54*d415bd75Srobert std::string Name =
55*d415bd75Srobert Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);
56*d415bd75Srobert
57*d415bd75Srobert if (!Name.empty() && Name[0] == '<') {
58*d415bd75Srobert // Ignore ranges of characters, as their name is either absent or
59*d415bd75Srobert // generated.
60*d415bd75Srobert continue;
61*d415bd75Srobert }
62*d415bd75Srobert
63*d415bd75Srobert // Some aliases are ignored for compatibility with C++
64*d415bd75Srobert if (IsAliasFile) {
65*d415bd75Srobert std::string Kind = Line.substr(SecondSemiPos + 1);
66*d415bd75Srobert if (Kind != "control" && Kind != "correction" && Kind != "alternate")
67*d415bd75Srobert continue;
68*d415bd75Srobert }
69*d415bd75Srobert
70*d415bd75Srobert auto InsertUnique = [&](char32_t CP, std::string Name) {
71*d415bd75Srobert auto It = CollectedCharacters.find(CP);
72*d415bd75Srobert while (It != std::end(CollectedCharacters) && It->first == CP) {
73*d415bd75Srobert if (It->second == Name)
74*d415bd75Srobert return;
75*d415bd75Srobert ++It;
76*d415bd75Srobert }
77*d415bd75Srobert CollectedCharacters.insert({CP, std::move(Name)});
78*d415bd75Srobert };
79*d415bd75Srobert InsertUnique(CodePoint, std::move(Name));
80*d415bd75Srobert }
81*d415bd75Srobert };
82*d415bd75Srobert
83*d415bd75Srobert FromFile(NamesFile);
84*d415bd75Srobert FromFile(AliasesFile, true);
85*d415bd75Srobert return CollectedCharacters;
86*d415bd75Srobert }
87*d415bd75Srobert
88*d415bd75Srobert class Trie {
89*d415bd75Srobert struct Node;
90*d415bd75Srobert
91*d415bd75Srobert public:
92*d415bd75Srobert // When inserting named codepoint
93*d415bd75Srobert // We create a node per character in the name.
94*d415bd75Srobert // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
95*d415bd75Srobert // Once all characters are inserted, the tree is compacted
insert(llvm::StringRef Name,char32_t Codepoint)96*d415bd75Srobert void insert(llvm::StringRef Name, char32_t Codepoint) {
97*d415bd75Srobert Node *N = Root.get();
98*d415bd75Srobert for (auto Ch : Name) {
99*d415bd75Srobert std::string Label(1, Ch);
100*d415bd75Srobert auto It = llvm::find_if(N->Children,
101*d415bd75Srobert [&](const auto &C) { return C->Name == Label; });
102*d415bd75Srobert if (It == N->Children.end()) {
103*d415bd75Srobert It = N->Children.insert(It, std::make_unique<Node>(Label, N));
104*d415bd75Srobert }
105*d415bd75Srobert N = It->get();
106*d415bd75Srobert }
107*d415bd75Srobert N->Value = Codepoint;
108*d415bd75Srobert }
109*d415bd75Srobert
compact()110*d415bd75Srobert void compact() { compact(Root.get()); }
111*d415bd75Srobert
112*d415bd75Srobert // This creates 2 arrays of bytes from the tree:
113*d415bd75Srobert // A serialized dictionary of node labels,
114*d415bd75Srobert // And the nodes themselves.
115*d415bd75Srobert // The name of each label is found by indexing into the dictionary.
116*d415bd75Srobert // The longest names are inserted first into the dictionary,
117*d415bd75Srobert // in the hope it will contain shorter labels as substring,
118*d415bd75Srobert // thereby reducing duplication.
119*d415bd75Srobert // We could theorically be more clever by trying to minimizing the size
120*d415bd75Srobert // of the dictionary.
serialize()121*d415bd75Srobert std::pair<std::string, std::vector<uint8_t>> serialize() {
122*d415bd75Srobert std::set<std::string> Names = this->getNameFragments();
123*d415bd75Srobert std::vector<std::string> Sorted(Names.begin(), Names.end());
124*d415bd75Srobert llvm::sort(Sorted, [](const auto &a, const auto &b) {
125*d415bd75Srobert return a.size() > b.size();
126*d415bd75Srobert });
127*d415bd75Srobert std::string Dict(Letters.begin(), Letters.end());
128*d415bd75Srobert Dict.reserve(50000);
129*d415bd75Srobert for (const std::string &Name : Sorted) {
130*d415bd75Srobert if (Name.size() <= 1)
131*d415bd75Srobert continue;
132*d415bd75Srobert if (Dict.find(Name) != std::string::npos)
133*d415bd75Srobert continue;
134*d415bd75Srobert Dict += Name;
135*d415bd75Srobert }
136*d415bd75Srobert
137*d415bd75Srobert if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
138*d415bd75Srobert fprintf(stderr, "Dictionary too big to be serialized");
139*d415bd75Srobert exit(1);
140*d415bd75Srobert }
141*d415bd75Srobert
142*d415bd75Srobert auto Bytes = dumpIndex(Dict);
143*d415bd75Srobert return {Dict, Bytes};
144*d415bd75Srobert }
145*d415bd75Srobert
getNameFragments()146*d415bd75Srobert std::set<std::string> getNameFragments() {
147*d415bd75Srobert std::set<std::string> Keys;
148*d415bd75Srobert collectKeys(Root.get(), Keys);
149*d415bd75Srobert return Keys;
150*d415bd75Srobert }
151*d415bd75Srobert
152*d415bd75Srobert // Maps a valid char in an Unicode character name
153*d415bd75Srobert // To a 6 bits index.
letter(char C)154*d415bd75Srobert static uint8_t letter(char C) {
155*d415bd75Srobert auto Pos = Letters.find(C);
156*d415bd75Srobert assert(Pos != std::string::npos &&
157*d415bd75Srobert "Invalid letter in Unicode character name");
158*d415bd75Srobert return Pos;
159*d415bd75Srobert }
160*d415bd75Srobert
161*d415bd75Srobert // clang-format off
162*d415bd75Srobert // +================+============+======================+=============+========+===+==============+===============+
163*d415bd75Srobert // | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 |
164*d415bd75Srobert // +================+============+======================+=============+========+===+==============+===============+
165*d415bd75Srobert // | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children |
166*d415bd75Srobert // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
167*d415bd75Srobert // clang-format on
168*d415bd75Srobert
dumpIndex(const std::string & Dict)169*d415bd75Srobert std::vector<uint8_t> dumpIndex(const std::string &Dict) {
170*d415bd75Srobert struct ChildrenOffset {
171*d415bd75Srobert Node *FirstChild;
172*d415bd75Srobert std::size_t Offset;
173*d415bd75Srobert bool HasValue;
174*d415bd75Srobert };
175*d415bd75Srobert
176*d415bd75Srobert // Keep track of the start of each node
177*d415bd75Srobert // position in the serialized data.
178*d415bd75Srobert std::unordered_map<Node *, int32_t> Offsets;
179*d415bd75Srobert
180*d415bd75Srobert // Keep track of where to write the index
181*d415bd75Srobert // of the first children
182*d415bd75Srobert std::vector<ChildrenOffset> ChildrenOffsets;
183*d415bd75Srobert std::unordered_map<Node *, bool> SiblingTracker;
184*d415bd75Srobert std::deque<Node *> AllNodes;
185*d415bd75Srobert std::vector<uint8_t> Bytes;
186*d415bd75Srobert Bytes.reserve(250'000);
187*d415bd75Srobert // This leading byte is used by the reading code to detect the root node.
188*d415bd75Srobert Bytes.push_back(0);
189*d415bd75Srobert
190*d415bd75Srobert auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
191*d415bd75Srobert for (std::size_t Index = 0; Index < Children.size(); Index++) {
192*d415bd75Srobert const std::unique_ptr<Node> &Child = Children[Index];
193*d415bd75Srobert AllNodes.push_back(Child.get());
194*d415bd75Srobert if (Index != Children.size() - 1)
195*d415bd75Srobert SiblingTracker[Child.get()] = true;
196*d415bd75Srobert }
197*d415bd75Srobert };
198*d415bd75Srobert CollectChildren(Root->Children);
199*d415bd75Srobert
200*d415bd75Srobert while (!AllNodes.empty()) {
201*d415bd75Srobert const std::size_t Offset = Bytes.size();
202*d415bd75Srobert Node *const N = AllNodes.front();
203*d415bd75Srobert AllNodes.pop_front();
204*d415bd75Srobert
205*d415bd75Srobert assert(!N->Name.empty());
206*d415bd75Srobert Offsets[N] = Offset;
207*d415bd75Srobert
208*d415bd75Srobert uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
209*d415bd75Srobert // Single letter node are indexed in 6 bits
210*d415bd75Srobert if (N->Name.size() == 1) {
211*d415bd75Srobert FirstByte |= letter(N->Name[0]);
212*d415bd75Srobert Bytes.push_back(FirstByte);
213*d415bd75Srobert } else {
214*d415bd75Srobert // Otherwise we use a 16 bits index
215*d415bd75Srobert FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
216*d415bd75Srobert Bytes.push_back(FirstByte);
217*d415bd75Srobert auto PosInDict = Dict.find(N->Name);
218*d415bd75Srobert assert(PosInDict != std::string::npos);
219*d415bd75Srobert uint8_t Low = PosInDict;
220*d415bd75Srobert uint8_t High = ((PosInDict >> 8) & 0xFF);
221*d415bd75Srobert Bytes.push_back(High);
222*d415bd75Srobert Bytes.push_back(Low);
223*d415bd75Srobert }
224*d415bd75Srobert
225*d415bd75Srobert const bool HasSibling = SiblingTracker.count(N) != 0;
226*d415bd75Srobert const bool HasChildren = N->Children.size() != 0;
227*d415bd75Srobert
228*d415bd75Srobert if (!!N->Value) {
229*d415bd75Srobert uint32_t Value = (*(N->Value) << 3);
230*d415bd75Srobert uint8_t H = ((Value >> 16) & 0xFF);
231*d415bd75Srobert uint8_t M = ((Value >> 8) & 0xFF);
232*d415bd75Srobert uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
233*d415bd75Srobert uint8_t(HasChildren ? 0x02 : 0);
234*d415bd75Srobert
235*d415bd75Srobert Bytes.push_back(H);
236*d415bd75Srobert Bytes.push_back(M);
237*d415bd75Srobert Bytes.push_back(L);
238*d415bd75Srobert
239*d415bd75Srobert if (HasChildren) {
240*d415bd75Srobert ChildrenOffsets.push_back(
241*d415bd75Srobert ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
242*d415bd75Srobert // index of the first children
243*d415bd75Srobert Bytes.push_back(0x00);
244*d415bd75Srobert Bytes.push_back(0x00);
245*d415bd75Srobert Bytes.push_back(0x00);
246*d415bd75Srobert }
247*d415bd75Srobert } else {
248*d415bd75Srobert // When there is no value (that's most intermediate nodes)
249*d415bd75Srobert // Dispense of the 3 values bytes, and only store
250*d415bd75Srobert // 1 byte to track whether the node has sibling and children
251*d415bd75Srobert // + 2 bytes for the index of the first children if necessary.
252*d415bd75Srobert // That index also uses bytes 0-6 of the previous byte.
253*d415bd75Srobert uint8_t Byte =
254*d415bd75Srobert uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
255*d415bd75Srobert Bytes.push_back(Byte);
256*d415bd75Srobert if (HasChildren) {
257*d415bd75Srobert ChildrenOffsets.emplace_back(
258*d415bd75Srobert ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
259*d415bd75Srobert Bytes.push_back(0x00);
260*d415bd75Srobert Bytes.push_back(0x00);
261*d415bd75Srobert }
262*d415bd75Srobert }
263*d415bd75Srobert CollectChildren(N->Children);
264*d415bd75Srobert }
265*d415bd75Srobert
266*d415bd75Srobert // Once all the nodes are in the inndex
267*d415bd75Srobert // Fill the bytes we left to indicate the position
268*d415bd75Srobert // of the children
269*d415bd75Srobert for (const ChildrenOffset &Parent : ChildrenOffsets) {
270*d415bd75Srobert const auto It = Offsets.find(Parent.FirstChild);
271*d415bd75Srobert assert(It != Offsets.end());
272*d415bd75Srobert std::size_t Pos = It->second;
273*d415bd75Srobert if (Parent.HasValue) {
274*d415bd75Srobert Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
275*d415bd75Srobert } else {
276*d415bd75Srobert Bytes[Parent.Offset] =
277*d415bd75Srobert Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
278*d415bd75Srobert }
279*d415bd75Srobert Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
280*d415bd75Srobert Bytes[Parent.Offset + 2] = Pos & 0xFF;
281*d415bd75Srobert }
282*d415bd75Srobert
283*d415bd75Srobert // Add some padding so that the deserialization code
284*d415bd75Srobert // doesn't try to read past the enf of the array.
285*d415bd75Srobert Bytes.push_back(0);
286*d415bd75Srobert Bytes.push_back(0);
287*d415bd75Srobert Bytes.push_back(0);
288*d415bd75Srobert Bytes.push_back(0);
289*d415bd75Srobert Bytes.push_back(0);
290*d415bd75Srobert Bytes.push_back(0);
291*d415bd75Srobert
292*d415bd75Srobert return Bytes;
293*d415bd75Srobert }
294*d415bd75Srobert
295*d415bd75Srobert private:
collectKeys(Node * N,std::set<std::string> & Keys)296*d415bd75Srobert void collectKeys(Node *N, std::set<std::string> &Keys) {
297*d415bd75Srobert Keys.insert(N->Name);
298*d415bd75Srobert for (const std::unique_ptr<Node> &Child : N->Children) {
299*d415bd75Srobert collectKeys(Child.get(), Keys);
300*d415bd75Srobert }
301*d415bd75Srobert }
302*d415bd75Srobert
303*d415bd75Srobert // Merge sequences of 1-character nodes
304*d415bd75Srobert // This greatly reduce the total number of nodes,
305*d415bd75Srobert // and therefore the size of the index.
306*d415bd75Srobert // When the tree gets serialized, we only have 5 bytes to store the
307*d415bd75Srobert // size of a name. Overlong names (>32 characters) are therefore
308*d415bd75Srobert // kep into separate nodes
compact(Node * N)309*d415bd75Srobert void compact(Node *N) {
310*d415bd75Srobert for (auto &&Child : N->Children) {
311*d415bd75Srobert compact(Child.get());
312*d415bd75Srobert }
313*d415bd75Srobert if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
314*d415bd75Srobert (N->Parent->Name.size() + N->Name.size() <= 32)) {
315*d415bd75Srobert N->Parent->Value = N->Value;
316*d415bd75Srobert N->Parent->Name += N->Name;
317*d415bd75Srobert N->Parent->Children = std::move(N->Children);
318*d415bd75Srobert for (std::unique_ptr<Node> &c : N->Parent->Children) {
319*d415bd75Srobert c->Parent = N->Parent;
320*d415bd75Srobert }
321*d415bd75Srobert }
322*d415bd75Srobert }
323*d415bd75Srobert struct Node {
NodeTrie::Node324*d415bd75Srobert Node(std::string Name, Node *Parent = nullptr)
325*d415bd75Srobert : Name(Name), Parent(Parent) {}
326*d415bd75Srobert
327*d415bd75Srobert std::vector<std::unique_ptr<Node>> Children;
328*d415bd75Srobert std::string Name;
329*d415bd75Srobert Node *Parent = nullptr;
330*d415bd75Srobert std::optional<char32_t> Value;
331*d415bd75Srobert };
332*d415bd75Srobert
333*d415bd75Srobert std::unique_ptr<Node> Root = std::make_unique<Node>("");
334*d415bd75Srobert };
335*d415bd75Srobert
336*d415bd75Srobert extern const char *UnicodeLicense;
337*d415bd75Srobert
main(int argc,char ** argv)338*d415bd75Srobert int main(int argc, char **argv) {
339*d415bd75Srobert printf("Unicode name -> codepoint mapping generator\n"
340*d415bd75Srobert "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
341*d415bd75Srobert argv[0]);
342*d415bd75Srobert printf("NameAliases.txt can be found at "
343*d415bd75Srobert "https://unicode.org/Public/15.0.0/ucd/NameAliases.txt\n"
344*d415bd75Srobert "UnicodeData.txt can be found at "
345*d415bd75Srobert "https://unicode.org/Public/15.0.0/ucd/UnicodeData.txt\n\n");
346*d415bd75Srobert
347*d415bd75Srobert if (argc != 4)
348*d415bd75Srobert return EXIT_FAILURE;
349*d415bd75Srobert
350*d415bd75Srobert FILE *Out = fopen(argv[3], "w");
351*d415bd75Srobert if (!Out) {
352*d415bd75Srobert printf("Error creating output file.\n");
353*d415bd75Srobert return EXIT_FAILURE;
354*d415bd75Srobert }
355*d415bd75Srobert
356*d415bd75Srobert Trie T;
357*d415bd75Srobert uint32_t NameCount = 0;
358*d415bd75Srobert std::size_t LongestName = 0;
359*d415bd75Srobert auto Entries = loadDataFiles(argv[1], argv[2]);
360*d415bd75Srobert for (const std::pair<const char32_t, std::string> &Entry : Entries) {
361*d415bd75Srobert char32_t Codepoint = Entry.first;
362*d415bd75Srobert const std::string &Name = Entry.second;
363*d415bd75Srobert // Ignore names which are not valid.
364*d415bd75Srobert if (Name.empty() || !llvm::all_of(Name, [](char C) {
365*d415bd75Srobert return llvm::is_contained(Letters, C);
366*d415bd75Srobert })) {
367*d415bd75Srobert continue;
368*d415bd75Srobert }
369*d415bd75Srobert printf("%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
370*d415bd75Srobert T.insert(Name, Codepoint);
371*d415bd75Srobert LongestName =
372*d415bd75Srobert std::max(LongestName, std::size_t(llvm::count_if(Name, llvm::isAlnum)));
373*d415bd75Srobert NameCount++;
374*d415bd75Srobert }
375*d415bd75Srobert T.compact();
376*d415bd75Srobert
377*d415bd75Srobert std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
378*d415bd75Srobert const std::string &Dict = Data.first;
379*d415bd75Srobert const std::vector<uint8_t> &Tree = Data.second;
380*d415bd75Srobert
381*d415bd75Srobert fprintf(Out, R"(
382*d415bd75Srobert //===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
383*d415bd75Srobert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
384*d415bd75Srobert // See https://llvm.org/LICENSE.txt for license information.
385*d415bd75Srobert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
386*d415bd75Srobert //
387*d415bd75Srobert //===----------------------------------------------------------------------===//
388*d415bd75Srobert //
389*d415bd75Srobert // This file implements mapping the name of a unicode code point to its value.
390*d415bd75Srobert //
391*d415bd75Srobert // This file was generated using %s.
392*d415bd75Srobert // Do not edit manually.
393*d415bd75Srobert //
394*d415bd75Srobert //===----------------------------------------------------------------------===//
395*d415bd75Srobert %s
396*d415bd75Srobert
397*d415bd75Srobert
398*d415bd75Srobert
399*d415bd75Srobert #include "llvm/Support/Compiler.h"
400*d415bd75Srobert #include <cstddef>
401*d415bd75Srobert #include <cstdint>
402*d415bd75Srobert )",
403*d415bd75Srobert argv[0], UnicodeLicense);
404*d415bd75Srobert
405*d415bd75Srobert fprintf(Out,
406*d415bd75Srobert "namespace llvm { namespace sys { namespace unicode { \n"
407*d415bd75Srobert "extern const char *UnicodeNameToCodepointDict;\n"
408*d415bd75Srobert "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
409*d415bd75Srobert "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
410*d415bd75Srobert "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
411*d415bd75Srobert
412*d415bd75Srobert fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
413*d415bd75Srobert Dict.c_str());
414*d415bd75Srobert
415*d415bd75Srobert fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
416*d415bd75Srobert Tree.size() + 1);
417*d415bd75Srobert
418*d415bd75Srobert for (auto Byte : Tree) {
419*d415bd75Srobert fprintf(Out, "0x%02x,", Byte);
420*d415bd75Srobert }
421*d415bd75Srobert
422*d415bd75Srobert fprintf(Out, "0};");
423*d415bd75Srobert fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
424*d415bd75Srobert "UnicodeNameToCodepointIndex_; \n");
425*d415bd75Srobert fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
426*d415bd75Srobert Tree.size() + 1);
427*d415bd75Srobert fprintf(Out,
428*d415bd75Srobert "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
429*d415bd75Srobert LongestName);
430*d415bd75Srobert fprintf(Out, "\n}}}\n");
431*d415bd75Srobert fclose(Out);
432*d415bd75Srobert printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
433*d415bd75Srobert argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
434*d415bd75Srobert }
435*d415bd75Srobert
// Text of the Unicode license covering the data files this tool consumes.
// main() writes it verbatim into the generated source file, so it is wrapped
// in a C comment. The text is UTF-8 encoded.
const char *UnicodeLicense = R"(
/*
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE

See Terms of Use <https://www.unicode.org/copyright.html>
for definitions of Unicode Inc.’s Data Files and Software.

NOTICE TO USER: Carefully read the following legal agreement.
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT.
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
THE DATA FILES OR SOFTWARE.

COPYRIGHT AND PERMISSION NOTICE

Copyright © 1991-2022 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.

Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
(the "Data Files") or Unicode software and any associated documentation
(the "Software") to deal in the Data Files or Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of
the Data Files or Software, and to permit persons to whom the Data Files
or Software are furnished to do so, provided that either
(a) this copyright and permission notice appear with all copies
of the Data Files or Software, or
(b) this copyright and permission notice appear in associated
Documentation.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.
*/
)";
486