1 //===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp 10 // using UnicodeData.txt and NameAliases.txt available at 11 // https://unicode.org/Public/15.0.0/ucd/ 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/ADT/STLExtras.h" 15 #include "llvm/ADT/StringExtras.h" 16 #include "llvm/ADT/StringRef.h" 17 #include <algorithm> 18 #include <array> 19 #include <deque> 20 #include <fstream> 21 #include <memory> 22 #include <optional> 23 #include <set> 24 #include <string> 25 #include <unordered_map> 26 #include <utility> 27 #include <vector> 28 29 static const llvm::StringRef Letters = 30 " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; 31 32 // Collect names UnicodeData.txt and AliasNames.txt 33 // There may be multiple names per code points. 34 static std::unordered_multimap<char32_t, std::string> 35 loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) { 36 std::unordered_multimap<char32_t, std::string> CollectedCharacters; 37 auto FromFile = [&](const std::string &File, bool IsAliasFile = false) { 38 std::ifstream InputFile(File); 39 for (std::string Line; getline(InputFile, Line);) { 40 if (Line.empty() || !isxdigit(Line[0])) 41 continue; 42 auto FirstSemiPos = Line.find(';'); 43 if (FirstSemiPos == std::string::npos) 44 continue; 45 auto SecondSemiPos = Line.find(';', FirstSemiPos + 1); 46 if (SecondSemiPos == std::string::npos) 47 continue; 48 unsigned long long CodePoint; 49 if (llvm::getAsUnsignedInteger( 50 llvm::StringRef(Line.c_str(), FirstSemiPos), 16, CodePoint)) { 51 continue; 52 } 53 54 std::string Name = 55 Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1); 56 57 if (!Name.empty() && Name[0] == '<') { 58 // Ignore ranges of characters, as their name is either absent or 59 // generated. 60 continue; 61 } 62 63 // Some aliases are ignored for compatibility with C++ 64 if (IsAliasFile) { 65 std::string Kind = Line.substr(SecondSemiPos + 1); 66 if (Kind != "control" && Kind != "correction" && Kind != "alternate") 67 continue; 68 } 69 70 auto InsertUnique = [&](char32_t CP, std::string Name) { 71 auto It = CollectedCharacters.find(CP); 72 while (It != std::end(CollectedCharacters) && It->first == CP) { 73 if (It->second == Name) 74 return; 75 ++It; 76 } 77 CollectedCharacters.insert({CP, std::move(Name)}); 78 }; 79 InsertUnique(CodePoint, std::move(Name)); 80 } 81 }; 82 83 FromFile(NamesFile); 84 FromFile(AliasesFile, true); 85 return CollectedCharacters; 86 } 87 88 class Trie { 89 struct Node; 90 91 public: 92 // When inserting named codepoint 93 // We create a node per character in the name. 94 // SPARKLE becomes S <- P <- A <- R <- K <- L <- E 95 // Once all characters are inserted, the tree is compacted 96 void insert(llvm::StringRef Name, char32_t Codepoint) { 97 Node *N = Root.get(); 98 bool IsBeforeMedial = false; 99 for (auto ChIt = Name.begin(); ChIt != Name.end(); 100 ChIt += (IsBeforeMedial ? 3 : 1)) { 101 char Ch = *ChIt; 102 assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name"); 103 104 std::string Label(1, Ch); 105 106 // We need to ensure a node never ends or starts by 107 // a medial hyphen as this would break the 108 // loose matching algorithm. 109 IsBeforeMedial = llvm::isAlnum(Ch) && ChIt + 1 != Name.end() && 110 *(ChIt + 1) == '-' && ChIt + 2 != Name.end() && 111 llvm::isAlnum(*(ChIt + 2)); 112 if (IsBeforeMedial) 113 Label.assign(ChIt, ChIt + 3); 114 115 auto It = llvm::find_if(N->Children, 116 [&](const auto &C) { return C->Name == Label; }); 117 if (It == N->Children.end()) { 118 It = N->Children.insert(It, std::make_unique<Node>(Label, N)); 119 } 120 N = It->get(); 121 } 122 N->Value = Codepoint; 123 } 124 125 void compact() { compact(Root.get()); } 126 127 // This creates 2 arrays of bytes from the tree: 128 // A serialized dictionary of node labels, 129 // And the nodes themselves. 130 // The name of each label is found by indexing into the dictionary. 131 // The longest names are inserted first into the dictionary, 132 // in the hope it will contain shorter labels as substring, 133 // thereby reducing duplication. 134 // We could theorically be more clever by trying to minimizing the size 135 // of the dictionary. 136 std::pair<std::string, std::vector<uint8_t>> serialize() { 137 std::set<std::string> Names = this->getNameFragments(); 138 std::vector<std::string> Sorted(Names.begin(), Names.end()); 139 llvm::sort(Sorted, [](const auto &a, const auto &b) { 140 return a.size() > b.size(); 141 }); 142 std::string Dict(Letters.begin(), Letters.end()); 143 Dict.reserve(50000); 144 for (const std::string &Name : Sorted) { 145 if (Name.size() <= 1) 146 continue; 147 if (Dict.find(Name) != std::string::npos) 148 continue; 149 Dict += Name; 150 } 151 152 if (Dict.size() >= std::numeric_limits<uint16_t>::max()) { 153 fprintf(stderr, "Dictionary too big to be serialized"); 154 exit(1); 155 } 156 157 auto Bytes = dumpIndex(Dict); 158 return {Dict, Bytes}; 159 } 160 161 std::set<std::string> getNameFragments() { 162 std::set<std::string> Keys; 163 collectKeys(Root.get(), Keys); 164 return Keys; 165 } 166 167 // Maps a valid char in an Unicode character name 168 // To a 6 bits index. 169 static uint8_t letter(char C) { 170 auto Pos = Letters.find(C); 171 assert(Pos != std::string::npos && 172 "Invalid letter in Unicode character name"); 173 return Pos; 174 } 175 176 // clang-format off 177 // +================+============+======================+=============+========+===+==============+===============+ 178 // | 0 | 1 | 2-7 (6) | 8-23 | 24-44 | | 46 | 47 | 179 // +================+============+======================+=============+========+===+==============+===============+ 180 // | Has Value | Has Long Name | Letter OR Name Size | Dict Index | Value | | Has Sibling | Has Children | 181 // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+ 182 // clang-format on 183 184 std::vector<uint8_t> dumpIndex(const std::string &Dict) { 185 struct ChildrenOffset { 186 Node *FirstChild; 187 std::size_t Offset; 188 bool HasValue; 189 }; 190 191 // Keep track of the start of each node 192 // position in the serialized data. 193 std::unordered_map<Node *, int32_t> Offsets; 194 195 // Keep track of where to write the index 196 // of the first children 197 std::vector<ChildrenOffset> ChildrenOffsets; 198 std::unordered_map<Node *, bool> SiblingTracker; 199 std::deque<Node *> AllNodes; 200 std::vector<uint8_t> Bytes; 201 Bytes.reserve(250'000); 202 // This leading byte is used by the reading code to detect the root node. 203 Bytes.push_back(0); 204 205 auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) { 206 for (std::size_t Index = 0; Index < Children.size(); Index++) { 207 const std::unique_ptr<Node> &Child = Children[Index]; 208 AllNodes.push_back(Child.get()); 209 if (Index != Children.size() - 1) 210 SiblingTracker[Child.get()] = true; 211 } 212 }; 213 CollectChildren(Root->Children); 214 215 while (!AllNodes.empty()) { 216 const std::size_t Offset = Bytes.size(); 217 Node *const N = AllNodes.front(); 218 AllNodes.pop_front(); 219 220 assert(!N->Name.empty()); 221 Offsets[N] = Offset; 222 223 uint8_t FirstByte = (!!N->Value) ? 0x80 : 0; 224 // Single letter node are indexed in 6 bits 225 if (N->Name.size() == 1) { 226 FirstByte |= letter(N->Name[0]); 227 Bytes.push_back(FirstByte); 228 } else { 229 // Otherwise we use a 16 bits index 230 FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40; 231 Bytes.push_back(FirstByte); 232 auto PosInDict = Dict.find(N->Name); 233 assert(PosInDict != std::string::npos); 234 uint8_t Low = PosInDict; 235 uint8_t High = ((PosInDict >> 8) & 0xFF); 236 Bytes.push_back(High); 237 Bytes.push_back(Low); 238 } 239 240 const bool HasSibling = SiblingTracker.count(N) != 0; 241 const bool HasChildren = N->Children.size() != 0; 242 243 if (!!N->Value) { 244 uint32_t Value = (*(N->Value) << 3); 245 uint8_t H = ((Value >> 16) & 0xFF); 246 uint8_t M = ((Value >> 8) & 0xFF); 247 uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) | 248 uint8_t(HasChildren ? 0x02 : 0); 249 250 Bytes.push_back(H); 251 Bytes.push_back(M); 252 Bytes.push_back(L); 253 254 if (HasChildren) { 255 ChildrenOffsets.push_back( 256 ChildrenOffset{N->Children[0].get(), Bytes.size(), true}); 257 // index of the first children 258 Bytes.push_back(0x00); 259 Bytes.push_back(0x00); 260 Bytes.push_back(0x00); 261 } 262 } else { 263 // When there is no value (that's most intermediate nodes) 264 // Dispense of the 3 values bytes, and only store 265 // 1 byte to track whether the node has sibling and children 266 // + 2 bytes for the index of the first children if necessary. 267 // That index also uses bytes 0-6 of the previous byte. 268 uint8_t Byte = 269 uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0); 270 Bytes.push_back(Byte); 271 if (HasChildren) { 272 ChildrenOffsets.emplace_back( 273 ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false}); 274 Bytes.push_back(0x00); 275 Bytes.push_back(0x00); 276 } 277 } 278 CollectChildren(N->Children); 279 } 280 281 // Once all the nodes are in the inndex 282 // Fill the bytes we left to indicate the position 283 // of the children 284 for (const ChildrenOffset &Parent : ChildrenOffsets) { 285 const auto It = Offsets.find(Parent.FirstChild); 286 assert(It != Offsets.end()); 287 std::size_t Pos = It->second; 288 if (Parent.HasValue) { 289 Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF); 290 } else { 291 Bytes[Parent.Offset] = 292 Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF); 293 } 294 Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF); 295 Bytes[Parent.Offset + 2] = Pos & 0xFF; 296 } 297 298 // Add some padding so that the deserialization code 299 // doesn't try to read past the enf of the array. 300 Bytes.push_back(0); 301 Bytes.push_back(0); 302 Bytes.push_back(0); 303 Bytes.push_back(0); 304 Bytes.push_back(0); 305 Bytes.push_back(0); 306 307 return Bytes; 308 } 309 310 private: 311 void collectKeys(Node *N, std::set<std::string> &Keys) { 312 Keys.insert(N->Name); 313 for (const std::unique_ptr<Node> &Child : N->Children) { 314 collectKeys(Child.get(), Keys); 315 } 316 } 317 318 // Merge sequences of 1-character nodes 319 // This greatly reduce the total number of nodes, 320 // and therefore the size of the index. 321 // When the tree gets serialized, we only have 5 bytes to store the 322 // size of a name. Overlong names (>32 characters) are therefore 323 // kep into separate nodes 324 void compact(Node *N) { 325 for (auto &&Child : N->Children) { 326 compact(Child.get()); 327 } 328 if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value && 329 (N->Parent->Name.size() + N->Name.size() <= 32)) { 330 N->Parent->Value = N->Value; 331 N->Parent->Name += N->Name; 332 N->Parent->Children = std::move(N->Children); 333 for (std::unique_ptr<Node> &c : N->Parent->Children) { 334 c->Parent = N->Parent; 335 } 336 } 337 } 338 struct Node { 339 Node(std::string Name, Node *Parent = nullptr) 340 : Name(Name), Parent(Parent) {} 341 342 std::vector<std::unique_ptr<Node>> Children; 343 std::string Name; 344 Node *Parent = nullptr; 345 std::optional<char32_t> Value; 346 }; 347 348 std::unique_ptr<Node> Root = std::make_unique<Node>(""); 349 }; 350 351 extern const char *UnicodeLicense; 352 353 int main(int argc, char **argv) { 354 printf("Unicode name -> codepoint mapping generator\n" 355 "Usage: %s UnicodeData.txt NameAliases.txt output\n\n", 356 argv[0]); 357 printf("NameAliases.txt can be found at " 358 "https://unicode.org/Public/15.0.0/ucd/NameAliases.txt\n" 359 "UnicodeData.txt can be found at " 360 "https://unicode.org/Public/15.0.0/ucd/UnicodeData.txt\n\n"); 361 362 if (argc != 4) 363 return EXIT_FAILURE; 364 365 FILE *Out = fopen(argv[3], "w"); 366 if (!Out) { 367 printf("Error creating output file.\n"); 368 return EXIT_FAILURE; 369 } 370 371 Trie T; 372 uint32_t NameCount = 0; 373 std::size_t LongestName = 0; 374 auto Entries = loadDataFiles(argv[1], argv[2]); 375 for (const std::pair<const char32_t, std::string> &Entry : Entries) { 376 char32_t Codepoint = Entry.first; 377 const std::string &Name = Entry.second; 378 // Ignore names which are not valid. 379 if (Name.empty() || 380 !llvm::all_of(Name, [](char C) { return Letters.contains(C); })) { 381 continue; 382 } 383 printf("%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str()); 384 T.insert(Name, Codepoint); 385 LongestName = 386 std::max(LongestName, std::size_t(llvm::count_if(Name, llvm::isAlnum))); 387 NameCount++; 388 } 389 T.compact(); 390 391 std::pair<std::string, std::vector<uint8_t>> Data = T.serialize(); 392 const std::string &Dict = Data.first; 393 const std::vector<uint8_t> &Tree = Data.second; 394 395 fprintf(Out, R"( 396 //===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===// 397 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 398 // See https://llvm.org/LICENSE.txt for license information. 399 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 400 // 401 //===----------------------------------------------------------------------===// 402 // 403 // This file implements mapping the name of a unicode code point to its value. 404 // 405 // This file was generated using %s. 406 // Do not edit manually. 407 // 408 //===----------------------------------------------------------------------===// 409 %s 410 411 412 413 #include "llvm/Support/Compiler.h" 414 #include <cstddef> 415 #include <cstdint> 416 )", 417 argv[0], UnicodeLicense); 418 419 fprintf(Out, 420 "namespace llvm { namespace sys { namespace unicode { \n" 421 "extern const char *UnicodeNameToCodepointDict;\n" 422 "extern const uint8_t *UnicodeNameToCodepointIndex;\n" 423 "extern const std::size_t UnicodeNameToCodepointIndexSize;\n" 424 "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n"); 425 426 fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n", 427 Dict.c_str()); 428 429 fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n", 430 Tree.size() + 1); 431 432 for (auto Byte : Tree) { 433 fprintf(Out, "0x%02x,", Byte); 434 } 435 436 fprintf(Out, "0};"); 437 fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = " 438 "UnicodeNameToCodepointIndex_; \n"); 439 fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n", 440 Tree.size() + 1); 441 fprintf(Out, 442 "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n", 443 LongestName); 444 fprintf(Out, "\n}}}\n"); 445 fclose(Out); 446 printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n", 447 argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0); 448 } 449 450 const char *UnicodeLicense = R"( 451 /* 452 UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 453 454 See Terms of Use <https://www.unicode.org/copyright.html> 455 for definitions of Unicode Inc.’s Data Files and Software. 456 457 NOTICE TO USER: Carefully read the following legal agreement. 458 BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 459 DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 460 YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 461 TERMS AND CONDITIONS OF THIS AGREEMENT. 462 IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 463 THE DATA FILES OR SOFTWARE. 464 465 COPYRIGHT AND PERMISSION NOTICE 466 467 Copyright © 1991-2022 Unicode, Inc. All rights reserved. 468 Distributed under the Terms of Use in https://www.unicode.org/copyright.html. 469 470 Permission is hereby granted, free of charge, to any person obtaining 471 a copy of the Unicode data files and any associated documentation 472 (the "Data Files") or Unicode software and any associated documentation 473 (the "Software") to deal in the Data Files or Software 474 without restriction, including without limitation the rights to use, 475 copy, modify, merge, publish, distribute, and/or sell copies of 476 the Data Files or Software, and to permit persons to whom the Data Files 477 or Software are furnished to do so, provided that either 478 (a) this copyright and permission notice appear with all copies 479 of the Data Files or Software, or 480 (b) this copyright and permission notice appear in associated 481 Documentation. 482 483 THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 484 ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 485 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 486 NONINFRINGEMENT OF THIRD PARTY RIGHTS. 487 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 488 NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 489 DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 490 DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 491 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 492 PERFORMANCE OF THE DATA FILES OR SOFTWARE. 493 494 Except as contained in this notice, the name of a copyright holder 495 shall not be used in advertising or otherwise to promote the sale, 496 use or other dealings in these Data Files or Software without prior 497 written authorization of the copyright holder. 498 */ 499 )"; 500