xref: /llvm-project/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp (revision 03e43cf1c7ed0770b4afef915f82f2503a9f18fb)
1 //===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
10 // using UnicodeData.txt and NameAliases.txt available at
11 // https://unicode.org/Public/15.1.0/ucd/
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/StringExtras.h"
16 #include "llvm/ADT/StringRef.h"
17 #include <algorithm>
18 #include <array>
19 #include <deque>
20 #include <fstream>
21 #include <memory>
22 #include <optional>
23 #include <set>
24 #include <string>
25 #include <unordered_map>
26 #include <utility>
27 #include <vector>
28 
29 static const llvm::StringRef Letters =
30     " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
31 
32 // Collect names UnicodeData.txt and AliasNames.txt
33 // There may be multiple names per code points.
34 static std::unordered_multimap<char32_t, std::string>
loadDataFiles(const std::string & NamesFile,const std::string & AliasesFile)35 loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
36   std::unordered_multimap<char32_t, std::string> CollectedCharacters;
37   auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
38     std::ifstream InputFile(File);
39     for (std::string Line; getline(InputFile, Line);) {
40       if (Line.empty() || !isxdigit(Line[0]))
41         continue;
42       auto FirstSemiPos = Line.find(';');
43       if (FirstSemiPos == std::string::npos)
44         continue;
45       auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
46       if (SecondSemiPos == std::string::npos)
47         continue;
48       unsigned long long CodePoint;
49       if (llvm::getAsUnsignedInteger(
50               llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) {
51         continue;
52       }
53 
54       std::string Name =
55           Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);
56 
57       if (!Name.empty() && Name[0] == '<') {
58         // Ignore ranges of characters, as their name is either absent or
59         // generated.
60         continue;
61       }
62 
63       // Some aliases are ignored for compatibility with C++
64       if (IsAliasFile) {
65         std::string Kind = Line.substr(SecondSemiPos + 1);
66         if (Kind != "control" && Kind != "correction" && Kind != "alternate")
67           continue;
68       }
69 
70       auto InsertUnique = [&](char32_t CP, std::string Name) {
71         auto It = CollectedCharacters.find(CP);
72         while (It != std::end(CollectedCharacters) && It->first == CP) {
73           if (It->second == Name)
74             return;
75           ++It;
76         }
77         CollectedCharacters.insert({CP, std::move(Name)});
78       };
79       InsertUnique(CodePoint, std::move(Name));
80     }
81   };
82 
83   FromFile(NamesFile);
84   FromFile(AliasesFile, true);
85   return CollectedCharacters;
86 }
87 
88 class Trie {
89   struct Node;
90 
91 public:
92   // When inserting named codepoint
93   // We create a node per character in the name.
94   // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
95   // Once all  characters are inserted, the tree is compacted
insert(llvm::StringRef Name,char32_t Codepoint)96   void insert(llvm::StringRef Name, char32_t Codepoint) {
97     Node *N = Root.get();
98     bool IsBeforeMedial = false;
99     for (auto ChIt = Name.begin(); ChIt != Name.end();
100          ChIt += (IsBeforeMedial ? 3 : 1)) {
101       char Ch = *ChIt;
102       assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
103 
104       std::string Label(1, Ch);
105 
106       // We need to ensure a node never ends or starts by
107       // a medial hyphen as this would break the
108       // loose matching algorithm.
109       IsBeforeMedial = llvm::isAlnum(Ch) && ChIt + 1 != Name.end() &&
110                        *(ChIt + 1) == '-' && ChIt + 2 != Name.end() &&
111                        llvm::isAlnum(*(ChIt + 2));
112       if (IsBeforeMedial)
113         Label.assign(ChIt, ChIt + 3);
114 
115       auto It = llvm::find_if(N->Children,
116                               [&](const auto &C) { return C->Name == Label; });
117       if (It == N->Children.end()) {
118         It = N->Children.insert(It, std::make_unique<Node>(Label, N));
119       }
120       N = It->get();
121     }
122     N->Value = Codepoint;
123   }
124 
compact()125   void compact() { compact(Root.get()); }
126 
127   // This creates 2 arrays of bytes from the tree:
128   // A serialized dictionary of node labels,
129   // And the nodes themselves.
130   // The name of each label is found by indexing into the dictionary.
131   // The longest names are inserted first into the dictionary,
132   // in the hope it will contain shorter labels as substring,
133   // thereby reducing duplication.
134   // We could theorically be more clever by trying to minimizing the size
135   // of the dictionary.
serialize()136   std::pair<std::string, std::vector<uint8_t>> serialize() {
137     std::set<std::string> Names = this->getNameFragments();
138     std::vector<std::string> Sorted(Names.begin(), Names.end());
139     llvm::sort(Sorted, [](const auto &a, const auto &b) {
140       return a.size() > b.size();
141     });
142     std::string Dict(Letters.begin(), Letters.end());
143     Dict.reserve(50000);
144     for (const std::string &Name : Sorted) {
145       if (Name.size() <= 1)
146         continue;
147       if (Dict.find(Name) != std::string::npos)
148         continue;
149       Dict += Name;
150     }
151 
152     if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
153       fprintf(stderr, "Dictionary too big  to be serialized");
154       exit(1);
155     }
156 
157     auto Bytes = dumpIndex(Dict);
158     return {Dict, Bytes};
159   }
160 
getNameFragments()161   std::set<std::string> getNameFragments() {
162     std::set<std::string> Keys;
163     collectKeys(Root.get(), Keys);
164     return Keys;
165   }
166 
167   // Maps a valid char in an Unicode character name
168   // To a 6 bits index.
letter(char C)169   static uint8_t letter(char C) {
170     auto Pos = Letters.find(C);
171     assert(Pos != std::string::npos &&
172            "Invalid letter in Unicode character name");
173     return Pos;
174   }
175 
176   // clang-format off
177   // +================+============+======================+=============+========+===+==============+===============+
178   // | 0          | 1             | 2-7 (6)              | 8-23        | 24-44  |    | 46           | 47            |
179   // +================+============+======================+=============+========+===+==============+===============+
180   // | Has Value |  Has Long Name | Letter OR Name Size  | Dict Index  | Value  |    | Has Sibling  | Has Children  |
181   // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
182   // clang-format on
183 
dumpIndex(const std::string & Dict)184   std::vector<uint8_t> dumpIndex(const std::string &Dict) {
185     struct ChildrenOffset {
186       Node *FirstChild;
187       std::size_t Offset;
188       bool HasValue;
189     };
190 
191     // Keep track of the start of each node
192     // position in the serialized data.
193     std::unordered_map<Node *, int32_t> Offsets;
194 
195     // Keep track of where to write the index
196     // of the first children
197     std::vector<ChildrenOffset> ChildrenOffsets;
198     std::unordered_map<Node *, bool> SiblingTracker;
199     std::deque<Node *> AllNodes;
200     std::vector<uint8_t> Bytes;
201     Bytes.reserve(250'000);
202     // This leading byte is used by the reading code to detect the root node.
203     Bytes.push_back(0);
204 
205     auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
206       for (std::size_t Index = 0; Index < Children.size(); Index++) {
207         const std::unique_ptr<Node> &Child = Children[Index];
208         AllNodes.push_back(Child.get());
209         if (Index != Children.size() - 1)
210           SiblingTracker[Child.get()] = true;
211       }
212     };
213     CollectChildren(Root->Children);
214 
215     while (!AllNodes.empty()) {
216       const std::size_t Offset = Bytes.size();
217       Node *const N = AllNodes.front();
218       AllNodes.pop_front();
219 
220       assert(!N->Name.empty());
221       Offsets[N] = Offset;
222 
223       uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
224       // Single letter node are indexed in 6 bits
225       if (N->Name.size() == 1) {
226         FirstByte |= letter(N->Name[0]);
227         Bytes.push_back(FirstByte);
228       } else {
229         // Otherwise we use a 16 bits index
230         FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
231         Bytes.push_back(FirstByte);
232         auto PosInDict = Dict.find(N->Name);
233         assert(PosInDict != std::string::npos);
234         uint8_t Low = PosInDict;
235         uint8_t High = ((PosInDict >> 8) & 0xFF);
236         Bytes.push_back(High);
237         Bytes.push_back(Low);
238       }
239 
240       const bool HasSibling = SiblingTracker.count(N) != 0;
241       const bool HasChildren = N->Children.size() != 0;
242 
243       if (!!N->Value) {
244         uint32_t Value = (*(N->Value) << 3);
245         uint8_t H = ((Value >> 16) & 0xFF);
246         uint8_t M = ((Value >> 8) & 0xFF);
247         uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
248                     uint8_t(HasChildren ? 0x02 : 0);
249 
250         Bytes.push_back(H);
251         Bytes.push_back(M);
252         Bytes.push_back(L);
253 
254         if (HasChildren) {
255           ChildrenOffsets.push_back(
256               ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
257           // index of the first children
258           Bytes.push_back(0x00);
259           Bytes.push_back(0x00);
260           Bytes.push_back(0x00);
261         }
262       } else {
263         // When there is no value (that's most intermediate nodes)
264         // Dispense of the 3 values bytes, and only store
265         // 1 byte to track whether the node has sibling and children
266         // + 2 bytes for the index of the first children if necessary.
267         // That index also uses bytes 0-6 of the previous byte.
268         uint8_t Byte =
269             uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
270         Bytes.push_back(Byte);
271         if (HasChildren) {
272           ChildrenOffsets.emplace_back(
273               ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
274           Bytes.push_back(0x00);
275           Bytes.push_back(0x00);
276         }
277       }
278       CollectChildren(N->Children);
279     }
280 
281     // Once all the nodes are in the inndex
282     // Fill the bytes we left to indicate the position
283     // of the children
284     for (const ChildrenOffset &Parent : ChildrenOffsets) {
285       const auto It = Offsets.find(Parent.FirstChild);
286       assert(It != Offsets.end());
287       std::size_t Pos = It->second;
288       if (Parent.HasValue) {
289         Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
290       } else {
291         Bytes[Parent.Offset] =
292             Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
293       }
294       Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
295       Bytes[Parent.Offset + 2] = Pos & 0xFF;
296     }
297 
298     // Add some padding so that the deserialization code
299     // doesn't try to read past the enf of the array.
300     Bytes.push_back(0);
301     Bytes.push_back(0);
302     Bytes.push_back(0);
303     Bytes.push_back(0);
304     Bytes.push_back(0);
305     Bytes.push_back(0);
306 
307     return Bytes;
308   }
309 
310 private:
collectKeys(Node * N,std::set<std::string> & Keys)311   void collectKeys(Node *N, std::set<std::string> &Keys) {
312     Keys.insert(N->Name);
313     for (const std::unique_ptr<Node> &Child : N->Children) {
314       collectKeys(Child.get(), Keys);
315     }
316   }
317 
318   // Merge sequences of 1-character nodes
319   // This greatly reduce the total number of nodes,
320   // and therefore the size of the index.
321   // When the tree gets serialized, we only have 5 bytes to store the
322   // size of a name. Overlong names (>32 characters) are therefore
323   // kep into separate nodes
compact(Node * N)324   void compact(Node *N) {
325     for (auto &&Child : N->Children) {
326       compact(Child.get());
327     }
328     if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
329         (N->Parent->Name.size() + N->Name.size() <= 32)) {
330       N->Parent->Value = N->Value;
331       N->Parent->Name += N->Name;
332       N->Parent->Children = std::move(N->Children);
333       for (std::unique_ptr<Node> &c : N->Parent->Children) {
334         c->Parent = N->Parent;
335       }
336     }
337   }
338   struct Node {
NodeTrie::Node339     Node(std::string Name, Node *Parent = nullptr)
340         : Name(Name), Parent(Parent) {}
341 
342     std::vector<std::unique_ptr<Node>> Children;
343     std::string Name;
344     Node *Parent = nullptr;
345     std::optional<char32_t> Value;
346   };
347 
348   std::unique_ptr<Node> Root = std::make_unique<Node>("");
349 };
350 
351 extern const char *UnicodeLicense;
352 
main(int argc,char ** argv)353 int main(int argc, char **argv) {
354   printf("Unicode name -> codepoint mapping generator\n"
355          "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
356          argv[0]);
357   printf("NameAliases.txt can be found at "
358          "https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n"
359          "UnicodeData.txt can be found at "
360          "https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n");
361 
362   if (argc != 4)
363     return EXIT_FAILURE;
364 
365   FILE *Out = fopen(argv[3], "w");
366   if (!Out) {
367     printf("Error creating output file.\n");
368     return EXIT_FAILURE;
369   }
370 
371   Trie T;
372   uint32_t NameCount = 0;
373   std::size_t LongestName = 0;
374   auto Entries = loadDataFiles(argv[1], argv[2]);
375   for (const std::pair<const char32_t, std::string> &Entry : Entries) {
376     char32_t Codepoint = Entry.first;
377     const std::string &Name = Entry.second;
378     // Ignore names which are not valid.
379     if (Name.empty() ||
380         !llvm::all_of(Name, [](char C) { return Letters.contains(C); })) {
381       continue;
382     }
383     printf("%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
384     T.insert(Name, Codepoint);
385     LongestName =
386         std::max(LongestName, std::size_t(llvm::count_if(Name, llvm::isAlnum)));
387     NameCount++;
388   }
389   T.compact();
390 
391   std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
392   const std::string &Dict = Data.first;
393   const std::vector<uint8_t> &Tree = Data.second;
394 
395   fprintf(Out, R"(
396 //===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
397 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
398 // See https://llvm.org/LICENSE.txt for license information.
399 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
400 //
401 //===----------------------------------------------------------------------===//
402 //
403 // This file implements mapping the name of a unicode code point to its value.
404 //
405 // This file was generated using %s.
406 // Do not edit manually.
407 //
408 //===----------------------------------------------------------------------===//
409 %s
410 
411 
412 
413 #include "llvm/Support/Compiler.h"
414 #include <cstddef>
415 #include <cstdint>
416 )",
417           argv[0], UnicodeLicense);
418 
419   fprintf(Out,
420           "namespace llvm { namespace sys { namespace unicode { \n"
421           "extern const char *UnicodeNameToCodepointDict;\n"
422           "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
423           "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
424           "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
425 
426   fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
427           Dict.c_str());
428 
429   fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
430           Tree.size() + 1);
431 
432   for (auto Byte : Tree) {
433     fprintf(Out, "0x%02x,", Byte);
434   }
435 
436   fprintf(Out, "0};");
437   fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
438                "UnicodeNameToCodepointIndex_; \n");
439   fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
440           Tree.size() + 1);
441   fprintf(Out,
442           "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
443           LongestName);
444   fprintf(Out, "\n}}}\n");
445   fclose(Out);
446   printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
447          argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
448 }
449 
450 const char *UnicodeLicense = R"(
451 /*
452 UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
453 
454 See Terms of Use <https://www.unicode.org/copyright.html>
455 for definitions of Unicode Inc.’s Data Files and Software.
456 
457 NOTICE TO USER: Carefully read the following legal agreement.
458 BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
459 DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
460 YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
461 TERMS AND CONDITIONS OF THIS AGREEMENT.
462 IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
463 THE DATA FILES OR SOFTWARE.
464 
465 COPYRIGHT AND PERMISSION NOTICE
466 
467 Copyright © 1991-2022 Unicode, Inc. All rights reserved.
468 Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
469 
470 Permission is hereby granted, free of charge, to any person obtaining
471 a copy of the Unicode data files and any associated documentation
472 (the "Data Files") or Unicode software and any associated documentation
473 (the "Software") to deal in the Data Files or Software
474 without restriction, including without limitation the rights to use,
475 copy, modify, merge, publish, distribute, and/or sell copies of
476 the Data Files or Software, and to permit persons to whom the Data Files
477 or Software are furnished to do so, provided that either
478 (a) this copyright and permission notice appear with all copies
479 of the Data Files or Software, or
480 (b) this copyright and permission notice appear in associated
481 Documentation.
482 
483 THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
484 ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
485 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
486 NONINFRINGEMENT OF THIRD PARTY RIGHTS.
487 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
488 NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
489 DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
490 DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
491 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
492 PERFORMANCE OF THE DATA FILES OR SOFTWARE.
493 
494 Except as contained in this notice, the name of a copyright holder
495 shall not be used in advertising or otherwise to promote the sale,
496 use or other dealings in these Data Files or Software without prior
497 written authorization of the copyright holder.
498 */
499 )";
500