1c92056d0SCorentin Jabot //===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
2c92056d0SCorentin Jabot //
3c92056d0SCorentin Jabot // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4c92056d0SCorentin Jabot // See https://llvm.org/LICENSE.txt for license information.
5c92056d0SCorentin Jabot // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6c92056d0SCorentin Jabot //
7c92056d0SCorentin Jabot //===----------------------------------------------------------------------===//
8c92056d0SCorentin Jabot //
9c92056d0SCorentin Jabot // This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
10c92056d0SCorentin Jabot // using UnicodeData.txt and NameAliases.txt available at
11*03e43cf1Scor3ntin // https://unicode.org/Public/15.1.0/ucd/
12c92056d0SCorentin Jabot //===----------------------------------------------------------------------===//
13c92056d0SCorentin Jabot
14c92056d0SCorentin Jabot #include "llvm/ADT/STLExtras.h"
15c92056d0SCorentin Jabot #include "llvm/ADT/StringExtras.h"
16c92056d0SCorentin Jabot #include "llvm/ADT/StringRef.h"
17c92056d0SCorentin Jabot #include <algorithm>
18c92056d0SCorentin Jabot #include <array>
19c92056d0SCorentin Jabot #include <deque>
20c92056d0SCorentin Jabot #include <fstream>
21c92056d0SCorentin Jabot #include <memory>
22da6642a1SKazu Hirata #include <optional>
23c92056d0SCorentin Jabot #include <set>
24c92056d0SCorentin Jabot #include <string>
25c92056d0SCorentin Jabot #include <unordered_map>
26c92056d0SCorentin Jabot #include <utility>
27c92056d0SCorentin Jabot #include <vector>
28c92056d0SCorentin Jabot
29c92056d0SCorentin Jabot static const llvm::StringRef Letters =
30c92056d0SCorentin Jabot " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
31c92056d0SCorentin Jabot
32c92056d0SCorentin Jabot // Collect names UnicodeData.txt and AliasNames.txt
33c92056d0SCorentin Jabot // There may be multiple names per code points.
34c92056d0SCorentin Jabot static std::unordered_multimap<char32_t, std::string>
loadDataFiles(const std::string & NamesFile,const std::string & AliasesFile)35c92056d0SCorentin Jabot loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
36c92056d0SCorentin Jabot std::unordered_multimap<char32_t, std::string> CollectedCharacters;
37c92056d0SCorentin Jabot auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
38c92056d0SCorentin Jabot std::ifstream InputFile(File);
39c92056d0SCorentin Jabot for (std::string Line; getline(InputFile, Line);) {
40c92056d0SCorentin Jabot if (Line.empty() || !isxdigit(Line[0]))
41c92056d0SCorentin Jabot continue;
42c92056d0SCorentin Jabot auto FirstSemiPos = Line.find(';');
43c92056d0SCorentin Jabot if (FirstSemiPos == std::string::npos)
44c92056d0SCorentin Jabot continue;
45c92056d0SCorentin Jabot auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
460057756fSCorentin Jabot if (SecondSemiPos == std::string::npos)
47c92056d0SCorentin Jabot continue;
48c92056d0SCorentin Jabot unsigned long long CodePoint;
49c92056d0SCorentin Jabot if (llvm::getAsUnsignedInteger(
50c92056d0SCorentin Jabot llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) {
51c92056d0SCorentin Jabot continue;
52c92056d0SCorentin Jabot }
53c92056d0SCorentin Jabot
54c92056d0SCorentin Jabot std::string Name =
55c92056d0SCorentin Jabot Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);
56c92056d0SCorentin Jabot
57c92056d0SCorentin Jabot if (!Name.empty() && Name[0] == '<') {
58c92056d0SCorentin Jabot // Ignore ranges of characters, as their name is either absent or
59c92056d0SCorentin Jabot // generated.
60c92056d0SCorentin Jabot continue;
61c92056d0SCorentin Jabot }
62c92056d0SCorentin Jabot
63c92056d0SCorentin Jabot // Some aliases are ignored for compatibility with C++
64c92056d0SCorentin Jabot if (IsAliasFile) {
65c92056d0SCorentin Jabot std::string Kind = Line.substr(SecondSemiPos + 1);
66c92056d0SCorentin Jabot if (Kind != "control" && Kind != "correction" && Kind != "alternate")
67c92056d0SCorentin Jabot continue;
68c92056d0SCorentin Jabot }
69c92056d0SCorentin Jabot
70c92056d0SCorentin Jabot auto InsertUnique = [&](char32_t CP, std::string Name) {
71c92056d0SCorentin Jabot auto It = CollectedCharacters.find(CP);
72c92056d0SCorentin Jabot while (It != std::end(CollectedCharacters) && It->first == CP) {
73c92056d0SCorentin Jabot if (It->second == Name)
74c92056d0SCorentin Jabot return;
75c92056d0SCorentin Jabot ++It;
76c92056d0SCorentin Jabot }
77c92056d0SCorentin Jabot CollectedCharacters.insert({CP, std::move(Name)});
78c92056d0SCorentin Jabot };
79c92056d0SCorentin Jabot InsertUnique(CodePoint, std::move(Name));
80c92056d0SCorentin Jabot }
81c92056d0SCorentin Jabot };
82c92056d0SCorentin Jabot
83c92056d0SCorentin Jabot FromFile(NamesFile);
84c92056d0SCorentin Jabot FromFile(AliasesFile, true);
85c92056d0SCorentin Jabot return CollectedCharacters;
86c92056d0SCorentin Jabot }
87c92056d0SCorentin Jabot
88c92056d0SCorentin Jabot class Trie {
89c92056d0SCorentin Jabot struct Node;
90c92056d0SCorentin Jabot
91c92056d0SCorentin Jabot public:
92c92056d0SCorentin Jabot // When inserting named codepoint
93c92056d0SCorentin Jabot // We create a node per character in the name.
94c92056d0SCorentin Jabot // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
95c92056d0SCorentin Jabot // Once all characters are inserted, the tree is compacted
insert(llvm::StringRef Name,char32_t Codepoint)96c92056d0SCorentin Jabot void insert(llvm::StringRef Name, char32_t Codepoint) {
97c92056d0SCorentin Jabot Node *N = Root.get();
9868410fbeSCorentin Jabot bool IsBeforeMedial = false;
9968410fbeSCorentin Jabot for (auto ChIt = Name.begin(); ChIt != Name.end();
10068410fbeSCorentin Jabot ChIt += (IsBeforeMedial ? 3 : 1)) {
10168410fbeSCorentin Jabot char Ch = *ChIt;
10268410fbeSCorentin Jabot assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
10368410fbeSCorentin Jabot
104c92056d0SCorentin Jabot std::string Label(1, Ch);
10568410fbeSCorentin Jabot
10668410fbeSCorentin Jabot // We need to ensure a node never ends or starts by
10768410fbeSCorentin Jabot // a medial hyphen as this would break the
10868410fbeSCorentin Jabot // loose matching algorithm.
10968410fbeSCorentin Jabot IsBeforeMedial = llvm::isAlnum(Ch) && ChIt + 1 != Name.end() &&
11068410fbeSCorentin Jabot *(ChIt + 1) == '-' && ChIt + 2 != Name.end() &&
11168410fbeSCorentin Jabot llvm::isAlnum(*(ChIt + 2));
11268410fbeSCorentin Jabot if (IsBeforeMedial)
11368410fbeSCorentin Jabot Label.assign(ChIt, ChIt + 3);
11468410fbeSCorentin Jabot
115ce9f007cSKazu Hirata auto It = llvm::find_if(N->Children,
116c92056d0SCorentin Jabot [&](const auto &C) { return C->Name == Label; });
117c92056d0SCorentin Jabot if (It == N->Children.end()) {
118c92056d0SCorentin Jabot It = N->Children.insert(It, std::make_unique<Node>(Label, N));
119c92056d0SCorentin Jabot }
120c92056d0SCorentin Jabot N = It->get();
121c92056d0SCorentin Jabot }
122c92056d0SCorentin Jabot N->Value = Codepoint;
123c92056d0SCorentin Jabot }
124c92056d0SCorentin Jabot
compact()125c92056d0SCorentin Jabot void compact() { compact(Root.get()); }
126c92056d0SCorentin Jabot
127c92056d0SCorentin Jabot // This creates 2 arrays of bytes from the tree:
128c92056d0SCorentin Jabot // A serialized dictionary of node labels,
129c92056d0SCorentin Jabot // And the nodes themselves.
130c92056d0SCorentin Jabot // The name of each label is found by indexing into the dictionary.
131c92056d0SCorentin Jabot // The longest names are inserted first into the dictionary,
132c92056d0SCorentin Jabot // in the hope it will contain shorter labels as substring,
133c92056d0SCorentin Jabot // thereby reducing duplication.
134c92056d0SCorentin Jabot // We could theorically be more clever by trying to minimizing the size
135c92056d0SCorentin Jabot // of the dictionary.
serialize()136c92056d0SCorentin Jabot std::pair<std::string, std::vector<uint8_t>> serialize() {
137c92056d0SCorentin Jabot std::set<std::string> Names = this->getNameFragments();
138c92056d0SCorentin Jabot std::vector<std::string> Sorted(Names.begin(), Names.end());
139aba43035SDmitri Gribenko llvm::sort(Sorted, [](const auto &a, const auto &b) {
140aba43035SDmitri Gribenko return a.size() > b.size();
141aba43035SDmitri Gribenko });
142c92056d0SCorentin Jabot std::string Dict(Letters.begin(), Letters.end());
143c92056d0SCorentin Jabot Dict.reserve(50000);
144c92056d0SCorentin Jabot for (const std::string &Name : Sorted) {
145c92056d0SCorentin Jabot if (Name.size() <= 1)
146c92056d0SCorentin Jabot continue;
147c92056d0SCorentin Jabot if (Dict.find(Name) != std::string::npos)
148c92056d0SCorentin Jabot continue;
149c92056d0SCorentin Jabot Dict += Name;
150c92056d0SCorentin Jabot }
151c92056d0SCorentin Jabot
152c92056d0SCorentin Jabot if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
153c92056d0SCorentin Jabot fprintf(stderr, "Dictionary too big to be serialized");
154c92056d0SCorentin Jabot exit(1);
155c92056d0SCorentin Jabot }
156c92056d0SCorentin Jabot
157c92056d0SCorentin Jabot auto Bytes = dumpIndex(Dict);
158c92056d0SCorentin Jabot return {Dict, Bytes};
159c92056d0SCorentin Jabot }
160c92056d0SCorentin Jabot
getNameFragments()161c92056d0SCorentin Jabot std::set<std::string> getNameFragments() {
162c92056d0SCorentin Jabot std::set<std::string> Keys;
163c92056d0SCorentin Jabot collectKeys(Root.get(), Keys);
164c92056d0SCorentin Jabot return Keys;
165c92056d0SCorentin Jabot }
166c92056d0SCorentin Jabot
167c92056d0SCorentin Jabot // Maps a valid char in an Unicode character name
168c92056d0SCorentin Jabot // To a 6 bits index.
letter(char C)169c92056d0SCorentin Jabot static uint8_t letter(char C) {
170c92056d0SCorentin Jabot auto Pos = Letters.find(C);
171c92056d0SCorentin Jabot assert(Pos != std::string::npos &&
172c92056d0SCorentin Jabot "Invalid letter in Unicode character name");
173c92056d0SCorentin Jabot return Pos;
174c92056d0SCorentin Jabot }
175c92056d0SCorentin Jabot
176c92056d0SCorentin Jabot // clang-format off
177c92056d0SCorentin Jabot // +================+============+======================+=============+========+===+==============+===============+
178c92056d0SCorentin Jabot // | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 |
179c92056d0SCorentin Jabot // +================+============+======================+=============+========+===+==============+===============+
180c92056d0SCorentin Jabot // | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children |
181c92056d0SCorentin Jabot // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
182c92056d0SCorentin Jabot // clang-format on
183c92056d0SCorentin Jabot
dumpIndex(const std::string & Dict)184c92056d0SCorentin Jabot std::vector<uint8_t> dumpIndex(const std::string &Dict) {
185c92056d0SCorentin Jabot struct ChildrenOffset {
186c92056d0SCorentin Jabot Node *FirstChild;
187c92056d0SCorentin Jabot std::size_t Offset;
188c92056d0SCorentin Jabot bool HasValue;
189c92056d0SCorentin Jabot };
190c92056d0SCorentin Jabot
191c92056d0SCorentin Jabot // Keep track of the start of each node
192c92056d0SCorentin Jabot // position in the serialized data.
193c92056d0SCorentin Jabot std::unordered_map<Node *, int32_t> Offsets;
194c92056d0SCorentin Jabot
195c92056d0SCorentin Jabot // Keep track of where to write the index
196c92056d0SCorentin Jabot // of the first children
197c92056d0SCorentin Jabot std::vector<ChildrenOffset> ChildrenOffsets;
198c92056d0SCorentin Jabot std::unordered_map<Node *, bool> SiblingTracker;
199c92056d0SCorentin Jabot std::deque<Node *> AllNodes;
200c92056d0SCorentin Jabot std::vector<uint8_t> Bytes;
201c92056d0SCorentin Jabot Bytes.reserve(250'000);
202c92056d0SCorentin Jabot // This leading byte is used by the reading code to detect the root node.
203c92056d0SCorentin Jabot Bytes.push_back(0);
204c92056d0SCorentin Jabot
205c92056d0SCorentin Jabot auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
206c92056d0SCorentin Jabot for (std::size_t Index = 0; Index < Children.size(); Index++) {
207c92056d0SCorentin Jabot const std::unique_ptr<Node> &Child = Children[Index];
208c92056d0SCorentin Jabot AllNodes.push_back(Child.get());
209c92056d0SCorentin Jabot if (Index != Children.size() - 1)
210c92056d0SCorentin Jabot SiblingTracker[Child.get()] = true;
211c92056d0SCorentin Jabot }
212c92056d0SCorentin Jabot };
213c92056d0SCorentin Jabot CollectChildren(Root->Children);
214c92056d0SCorentin Jabot
215c92056d0SCorentin Jabot while (!AllNodes.empty()) {
216c92056d0SCorentin Jabot const std::size_t Offset = Bytes.size();
217c92056d0SCorentin Jabot Node *const N = AllNodes.front();
218c92056d0SCorentin Jabot AllNodes.pop_front();
219c92056d0SCorentin Jabot
220c92056d0SCorentin Jabot assert(!N->Name.empty());
221c92056d0SCorentin Jabot Offsets[N] = Offset;
222c92056d0SCorentin Jabot
223c92056d0SCorentin Jabot uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
224c92056d0SCorentin Jabot // Single letter node are indexed in 6 bits
225c92056d0SCorentin Jabot if (N->Name.size() == 1) {
226c92056d0SCorentin Jabot FirstByte |= letter(N->Name[0]);
227c92056d0SCorentin Jabot Bytes.push_back(FirstByte);
228c92056d0SCorentin Jabot } else {
229c92056d0SCorentin Jabot // Otherwise we use a 16 bits index
230c92056d0SCorentin Jabot FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
231c92056d0SCorentin Jabot Bytes.push_back(FirstByte);
232c92056d0SCorentin Jabot auto PosInDict = Dict.find(N->Name);
233c92056d0SCorentin Jabot assert(PosInDict != std::string::npos);
234c92056d0SCorentin Jabot uint8_t Low = PosInDict;
235c92056d0SCorentin Jabot uint8_t High = ((PosInDict >> 8) & 0xFF);
236c92056d0SCorentin Jabot Bytes.push_back(High);
237c92056d0SCorentin Jabot Bytes.push_back(Low);
238c92056d0SCorentin Jabot }
239c92056d0SCorentin Jabot
240c92056d0SCorentin Jabot const bool HasSibling = SiblingTracker.count(N) != 0;
241c92056d0SCorentin Jabot const bool HasChildren = N->Children.size() != 0;
242c92056d0SCorentin Jabot
243c92056d0SCorentin Jabot if (!!N->Value) {
244c92056d0SCorentin Jabot uint32_t Value = (*(N->Value) << 3);
245c92056d0SCorentin Jabot uint8_t H = ((Value >> 16) & 0xFF);
246c92056d0SCorentin Jabot uint8_t M = ((Value >> 8) & 0xFF);
247c92056d0SCorentin Jabot uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
248c92056d0SCorentin Jabot uint8_t(HasChildren ? 0x02 : 0);
249c92056d0SCorentin Jabot
250c92056d0SCorentin Jabot Bytes.push_back(H);
251c92056d0SCorentin Jabot Bytes.push_back(M);
252c92056d0SCorentin Jabot Bytes.push_back(L);
253c92056d0SCorentin Jabot
254c92056d0SCorentin Jabot if (HasChildren) {
255c92056d0SCorentin Jabot ChildrenOffsets.push_back(
256c92056d0SCorentin Jabot ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
257c92056d0SCorentin Jabot // index of the first children
258c92056d0SCorentin Jabot Bytes.push_back(0x00);
259c92056d0SCorentin Jabot Bytes.push_back(0x00);
260c92056d0SCorentin Jabot Bytes.push_back(0x00);
261c92056d0SCorentin Jabot }
262c92056d0SCorentin Jabot } else {
263c92056d0SCorentin Jabot // When there is no value (that's most intermediate nodes)
264c92056d0SCorentin Jabot // Dispense of the 3 values bytes, and only store
265a2d45017SKazu Hirata // 1 byte to track whether the node has sibling and children
266c92056d0SCorentin Jabot // + 2 bytes for the index of the first children if necessary.
267c92056d0SCorentin Jabot // That index also uses bytes 0-6 of the previous byte.
268c92056d0SCorentin Jabot uint8_t Byte =
269c92056d0SCorentin Jabot uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
270c92056d0SCorentin Jabot Bytes.push_back(Byte);
271c92056d0SCorentin Jabot if (HasChildren) {
272c92056d0SCorentin Jabot ChildrenOffsets.emplace_back(
273c92056d0SCorentin Jabot ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
274c92056d0SCorentin Jabot Bytes.push_back(0x00);
275c92056d0SCorentin Jabot Bytes.push_back(0x00);
276c92056d0SCorentin Jabot }
277c92056d0SCorentin Jabot }
278c92056d0SCorentin Jabot CollectChildren(N->Children);
279c92056d0SCorentin Jabot }
280c92056d0SCorentin Jabot
281c92056d0SCorentin Jabot // Once all the nodes are in the inndex
282c92056d0SCorentin Jabot // Fill the bytes we left to indicate the position
283c92056d0SCorentin Jabot // of the children
284c92056d0SCorentin Jabot for (const ChildrenOffset &Parent : ChildrenOffsets) {
285c92056d0SCorentin Jabot const auto It = Offsets.find(Parent.FirstChild);
286c92056d0SCorentin Jabot assert(It != Offsets.end());
287c92056d0SCorentin Jabot std::size_t Pos = It->second;
288c92056d0SCorentin Jabot if (Parent.HasValue) {
289c92056d0SCorentin Jabot Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
290c92056d0SCorentin Jabot } else {
291c92056d0SCorentin Jabot Bytes[Parent.Offset] =
292c92056d0SCorentin Jabot Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
293c92056d0SCorentin Jabot }
294c92056d0SCorentin Jabot Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
295c92056d0SCorentin Jabot Bytes[Parent.Offset + 2] = Pos & 0xFF;
296c92056d0SCorentin Jabot }
297c92056d0SCorentin Jabot
298c92056d0SCorentin Jabot // Add some padding so that the deserialization code
299c92056d0SCorentin Jabot // doesn't try to read past the enf of the array.
300c92056d0SCorentin Jabot Bytes.push_back(0);
301c92056d0SCorentin Jabot Bytes.push_back(0);
302c92056d0SCorentin Jabot Bytes.push_back(0);
303c92056d0SCorentin Jabot Bytes.push_back(0);
304c92056d0SCorentin Jabot Bytes.push_back(0);
305c92056d0SCorentin Jabot Bytes.push_back(0);
306c92056d0SCorentin Jabot
307c92056d0SCorentin Jabot return Bytes;
308c92056d0SCorentin Jabot }
309c92056d0SCorentin Jabot
310c92056d0SCorentin Jabot private:
collectKeys(Node * N,std::set<std::string> & Keys)311c92056d0SCorentin Jabot void collectKeys(Node *N, std::set<std::string> &Keys) {
312c92056d0SCorentin Jabot Keys.insert(N->Name);
313c92056d0SCorentin Jabot for (const std::unique_ptr<Node> &Child : N->Children) {
314c92056d0SCorentin Jabot collectKeys(Child.get(), Keys);
315c92056d0SCorentin Jabot }
316c92056d0SCorentin Jabot }
317c92056d0SCorentin Jabot
318c92056d0SCorentin Jabot // Merge sequences of 1-character nodes
319c92056d0SCorentin Jabot // This greatly reduce the total number of nodes,
320c92056d0SCorentin Jabot // and therefore the size of the index.
321c92056d0SCorentin Jabot // When the tree gets serialized, we only have 5 bytes to store the
322c92056d0SCorentin Jabot // size of a name. Overlong names (>32 characters) are therefore
323c92056d0SCorentin Jabot // kep into separate nodes
compact(Node * N)324c92056d0SCorentin Jabot void compact(Node *N) {
325c92056d0SCorentin Jabot for (auto &&Child : N->Children) {
326c92056d0SCorentin Jabot compact(Child.get());
327c92056d0SCorentin Jabot }
328c92056d0SCorentin Jabot if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
329c92056d0SCorentin Jabot (N->Parent->Name.size() + N->Name.size() <= 32)) {
330c92056d0SCorentin Jabot N->Parent->Value = N->Value;
331c92056d0SCorentin Jabot N->Parent->Name += N->Name;
332c92056d0SCorentin Jabot N->Parent->Children = std::move(N->Children);
333c92056d0SCorentin Jabot for (std::unique_ptr<Node> &c : N->Parent->Children) {
334c92056d0SCorentin Jabot c->Parent = N->Parent;
335c92056d0SCorentin Jabot }
336c92056d0SCorentin Jabot }
337c92056d0SCorentin Jabot }
338c92056d0SCorentin Jabot struct Node {
NodeTrie::Node339c92056d0SCorentin Jabot Node(std::string Name, Node *Parent = nullptr)
340c92056d0SCorentin Jabot : Name(Name), Parent(Parent) {}
341c92056d0SCorentin Jabot
342c92056d0SCorentin Jabot std::vector<std::unique_ptr<Node>> Children;
343c92056d0SCorentin Jabot std::string Name;
344c92056d0SCorentin Jabot Node *Parent = nullptr;
34577c90c8cSKazu Hirata std::optional<char32_t> Value;
346c92056d0SCorentin Jabot };
347c92056d0SCorentin Jabot
348c92056d0SCorentin Jabot std::unique_ptr<Node> Root = std::make_unique<Node>("");
349c92056d0SCorentin Jabot };
350c92056d0SCorentin Jabot
351c92056d0SCorentin Jabot extern const char *UnicodeLicense;
352c92056d0SCorentin Jabot
main(int argc,char ** argv)353c92056d0SCorentin Jabot int main(int argc, char **argv) {
354c92056d0SCorentin Jabot printf("Unicode name -> codepoint mapping generator\n"
355c92056d0SCorentin Jabot "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
356c92056d0SCorentin Jabot argv[0]);
357c92056d0SCorentin Jabot printf("NameAliases.txt can be found at "
358*03e43cf1Scor3ntin "https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n"
359c92056d0SCorentin Jabot "UnicodeData.txt can be found at "
360*03e43cf1Scor3ntin "https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n");
361c92056d0SCorentin Jabot
362c92056d0SCorentin Jabot if (argc != 4)
363c92056d0SCorentin Jabot return EXIT_FAILURE;
364c92056d0SCorentin Jabot
365c92056d0SCorentin Jabot FILE *Out = fopen(argv[3], "w");
366c92056d0SCorentin Jabot if (!Out) {
367c92056d0SCorentin Jabot printf("Error creating output file.\n");
368c92056d0SCorentin Jabot return EXIT_FAILURE;
369c92056d0SCorentin Jabot }
370c92056d0SCorentin Jabot
371c92056d0SCorentin Jabot Trie T;
372c92056d0SCorentin Jabot uint32_t NameCount = 0;
373c92056d0SCorentin Jabot std::size_t LongestName = 0;
374c92056d0SCorentin Jabot auto Entries = loadDataFiles(argv[1], argv[2]);
375c92056d0SCorentin Jabot for (const std::pair<const char32_t, std::string> &Entry : Entries) {
376c92056d0SCorentin Jabot char32_t Codepoint = Entry.first;
377c92056d0SCorentin Jabot const std::string &Name = Entry.second;
378c92056d0SCorentin Jabot // Ignore names which are not valid.
3798bdf3878SKazu Hirata if (Name.empty() ||
3808bdf3878SKazu Hirata !llvm::all_of(Name, [](char C) { return Letters.contains(C); })) {
381c92056d0SCorentin Jabot continue;
382c92056d0SCorentin Jabot }
38392d31a7cSAaron Ballman printf("%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
384c92056d0SCorentin Jabot T.insert(Name, Codepoint);
385c92056d0SCorentin Jabot LongestName =
386380a1b20SKazu Hirata std::max(LongestName, std::size_t(llvm::count_if(Name, llvm::isAlnum)));
387c92056d0SCorentin Jabot NameCount++;
388c92056d0SCorentin Jabot }
389c92056d0SCorentin Jabot T.compact();
390c92056d0SCorentin Jabot
391c92056d0SCorentin Jabot std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
392c92056d0SCorentin Jabot const std::string &Dict = Data.first;
393c92056d0SCorentin Jabot const std::vector<uint8_t> &Tree = Data.second;
394c92056d0SCorentin Jabot
395c92056d0SCorentin Jabot fprintf(Out, R"(
396c92056d0SCorentin Jabot //===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
397c92056d0SCorentin Jabot // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
398c92056d0SCorentin Jabot // See https://llvm.org/LICENSE.txt for license information.
399c92056d0SCorentin Jabot // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
400c92056d0SCorentin Jabot //
401c92056d0SCorentin Jabot //===----------------------------------------------------------------------===//
402c92056d0SCorentin Jabot //
403c92056d0SCorentin Jabot // This file implements mapping the name of a unicode code point to its value.
404c92056d0SCorentin Jabot //
405c92056d0SCorentin Jabot // This file was generated using %s.
406c92056d0SCorentin Jabot // Do not edit manually.
407c92056d0SCorentin Jabot //
408c92056d0SCorentin Jabot //===----------------------------------------------------------------------===//
409c92056d0SCorentin Jabot %s
410c92056d0SCorentin Jabot
411c92056d0SCorentin Jabot
412c92056d0SCorentin Jabot
413c92056d0SCorentin Jabot #include "llvm/Support/Compiler.h"
414c92056d0SCorentin Jabot #include <cstddef>
415c92056d0SCorentin Jabot #include <cstdint>
416c92056d0SCorentin Jabot )",
417c92056d0SCorentin Jabot argv[0], UnicodeLicense);
418c92056d0SCorentin Jabot
419c92056d0SCorentin Jabot fprintf(Out,
420c92056d0SCorentin Jabot "namespace llvm { namespace sys { namespace unicode { \n"
421c92056d0SCorentin Jabot "extern const char *UnicodeNameToCodepointDict;\n"
422c92056d0SCorentin Jabot "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
423c92056d0SCorentin Jabot "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
424c92056d0SCorentin Jabot "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
425c92056d0SCorentin Jabot
426c92056d0SCorentin Jabot fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
427c92056d0SCorentin Jabot Dict.c_str());
428c92056d0SCorentin Jabot
42992d31a7cSAaron Ballman fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
430c92056d0SCorentin Jabot Tree.size() + 1);
431c92056d0SCorentin Jabot
432c92056d0SCorentin Jabot for (auto Byte : Tree) {
433c92056d0SCorentin Jabot fprintf(Out, "0x%02x,", Byte);
434c92056d0SCorentin Jabot }
435c92056d0SCorentin Jabot
436c92056d0SCorentin Jabot fprintf(Out, "0};");
437c92056d0SCorentin Jabot fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
438c92056d0SCorentin Jabot "UnicodeNameToCodepointIndex_; \n");
43992d31a7cSAaron Ballman fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
440c92056d0SCorentin Jabot Tree.size() + 1);
441c92056d0SCorentin Jabot fprintf(Out,
44292d31a7cSAaron Ballman "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
443c92056d0SCorentin Jabot LongestName);
444c92056d0SCorentin Jabot fprintf(Out, "\n}}}\n");
445c92056d0SCorentin Jabot fclose(Out);
446c92056d0SCorentin Jabot printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
447c92056d0SCorentin Jabot argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
448c92056d0SCorentin Jabot }
449c92056d0SCorentin Jabot
450c92056d0SCorentin Jabot const char *UnicodeLicense = R"(
451c92056d0SCorentin Jabot /*
452c92056d0SCorentin Jabot UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
453c92056d0SCorentin Jabot
454c92056d0SCorentin Jabot See Terms of Use <https://www.unicode.org/copyright.html>
455c92056d0SCorentin Jabot for definitions of Unicode Inc.’s Data Files and Software.
456c92056d0SCorentin Jabot
457c92056d0SCorentin Jabot NOTICE TO USER: Carefully read the following legal agreement.
458c92056d0SCorentin Jabot BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
459c92056d0SCorentin Jabot DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
460c92056d0SCorentin Jabot YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
461c92056d0SCorentin Jabot TERMS AND CONDITIONS OF THIS AGREEMENT.
462c92056d0SCorentin Jabot IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
463c92056d0SCorentin Jabot THE DATA FILES OR SOFTWARE.
464c92056d0SCorentin Jabot
465c92056d0SCorentin Jabot COPYRIGHT AND PERMISSION NOTICE
466c92056d0SCorentin Jabot
467c92056d0SCorentin Jabot Copyright © 1991-2022 Unicode, Inc. All rights reserved.
468c92056d0SCorentin Jabot Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
469c92056d0SCorentin Jabot
470c92056d0SCorentin Jabot Permission is hereby granted, free of charge, to any person obtaining
471c92056d0SCorentin Jabot a copy of the Unicode data files and any associated documentation
472c92056d0SCorentin Jabot (the "Data Files") or Unicode software and any associated documentation
473c92056d0SCorentin Jabot (the "Software") to deal in the Data Files or Software
474c92056d0SCorentin Jabot without restriction, including without limitation the rights to use,
475c92056d0SCorentin Jabot copy, modify, merge, publish, distribute, and/or sell copies of
476c92056d0SCorentin Jabot the Data Files or Software, and to permit persons to whom the Data Files
477c92056d0SCorentin Jabot or Software are furnished to do so, provided that either
478c92056d0SCorentin Jabot (a) this copyright and permission notice appear with all copies
479c92056d0SCorentin Jabot of the Data Files or Software, or
480c92056d0SCorentin Jabot (b) this copyright and permission notice appear in associated
481c92056d0SCorentin Jabot Documentation.
482c92056d0SCorentin Jabot
483c92056d0SCorentin Jabot THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
484c92056d0SCorentin Jabot ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
485c92056d0SCorentin Jabot WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
486c92056d0SCorentin Jabot NONINFRINGEMENT OF THIRD PARTY RIGHTS.
487c92056d0SCorentin Jabot IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
488c92056d0SCorentin Jabot NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
489c92056d0SCorentin Jabot DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
490c92056d0SCorentin Jabot DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
491c92056d0SCorentin Jabot TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
492c92056d0SCorentin Jabot PERFORMANCE OF THE DATA FILES OR SOFTWARE.
493c92056d0SCorentin Jabot
494c92056d0SCorentin Jabot Except as contained in this notice, the name of a copyright holder
495c92056d0SCorentin Jabot shall not be used in advertising or otherwise to promote the sale,
496c92056d0SCorentin Jabot use or other dealings in these Data Files or Software without prior
497c92056d0SCorentin Jabot written authorization of the copyright holder.
498c92056d0SCorentin Jabot */
499c92056d0SCorentin Jabot )";
500