utils/UnicodeData/UnicodeNameMappingGenerator.cpp

//===--- UnicodeNameMappingGenerator.cpp - Unicode name data generator ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
// using UnicodeData.txt and NameAliases.txt available at
// https://unicode.org/Public/15.0.0/ucd/
//===----------------------------------------------------------------------===//
*d415bd75Srobert
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <charconv>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <deque>
#include <fstream>
#include <limits>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <system_error>
#include <unordered_map>
#include <utility>
#include <vector>
*d415bd75Srobert
*d415bd75Srobertstatic const llvm::StringRef Letters =
*d415bd75Srobert    " _-ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
*d415bd75Srobert
// Collects the character names found in UnicodeData.txt and NameAliases.txt.
// There may be multiple names per code point.
//
// Both files are read line by line as ';'-separated records whose first field
// is a hexadecimal code point and whose second field is a name:
//  - lines that do not start with a hexadecimal digit, that have fewer than
//    two ';', or whose first field is not a valid code point are skipped;
//  - names starting with '<' (ranges and unnamed characters) are skipped;
//  - in the aliases file, only records whose third field is "control",
//    "correction" or "alternate" are kept.
//
// \param NamesFile   path of UnicodeData.txt.
// \param AliasesFile path of NameAliases.txt.
// \returns every distinct (code point, name) pair that was found.
//
// If either file cannot be opened, an error is printed to stderr and the
// program exits with EXIT_FAILURE: silently generating a table from partial
// data would be worse than failing.
static std::unordered_multimap<char32_t, std::string>
loadDataFiles(const std::string &NamesFile, const std::string &AliasesFile) {
  std::unordered_multimap<char32_t, std::string> CollectedCharacters;

  // Records (CP, Name) unless that exact pair is already present.
  auto InsertUnique = [&CollectedCharacters](char32_t CP, std::string Name) {
    const auto Range = CollectedCharacters.equal_range(CP);
    for (auto It = Range.first; It != Range.second; ++It) {
      if (It->second == Name)
        return;
    }
    CollectedCharacters.emplace(CP, std::move(Name));
  };

  auto FromFile = [&](const std::string &File, bool IsAliasFile = false) {
    std::ifstream InputFile(File);
    if (!InputFile) {
      std::fprintf(stderr, "Error: cannot open input file '%s'.\n",
                   File.c_str());
      std::exit(EXIT_FAILURE);
    }

    for (std::string Line; std::getline(InputFile, Line);) {
      // Drop trailing whitespace, in particular the '\r' left behind by
      // CRLF line endings, so that the last field compares cleanly.
      while (!Line.empty() &&
             std::isspace(static_cast<unsigned char>(Line.back())))
        Line.pop_back();

      // The cast keeps isxdigit well-defined for bytes >= 0x80.
      if (Line.empty() || !std::isxdigit(static_cast<unsigned char>(Line[0])))
        continue;

      const auto FirstSemiPos = Line.find(';');
      if (FirstSemiPos == std::string::npos)
        continue;
      const auto SecondSemiPos = Line.find(';', FirstSemiPos + 1);
      if (SecondSemiPos == std::string::npos)
        continue;

      // The whole first field must be a hexadecimal number.
      unsigned long long CodePoint = 0;
      const char *const FieldBegin = Line.data();
      const char *const FieldEnd = FieldBegin + FirstSemiPos;
      const auto Parsed = std::from_chars(FieldBegin, FieldEnd, CodePoint, 16);
      if (Parsed.ec != std::errc() || Parsed.ptr != FieldEnd)
        continue;
      // Anything above U+10FFFF is not a code point and would otherwise be
      // silently narrowed when converted to char32_t.
      if (CodePoint > 0x10FFFF)
        continue;

      std::string Name =
          Line.substr(FirstSemiPos + 1, SecondSemiPos - FirstSemiPos - 1);

      if (!Name.empty() && Name[0] == '<') {
        // Ignore ranges of characters, as their name is either absent or
        // generated.
        continue;
      }

      // Some aliases are ignored for compatibility with C++
      if (IsAliasFile) {
        const std::string Kind = Line.substr(SecondSemiPos + 1);
        if (Kind != "control" && Kind != "correction" && Kind != "alternate")
          continue;
      }

      InsertUnique(static_cast<char32_t>(CodePoint), std::move(Name));
    }
  };

  FromFile(NamesFile);
  FromFile(AliasesFile, true);
  return CollectedCharacters;
}
*d415bd75Srobert
*d415bd75Srobertclass Trie {
*d415bd75Srobert  struct Node;
*d415bd75Srobert
*d415bd75Srobertpublic:
*d415bd75Srobert  // When inserting named codepoint
*d415bd75Srobert  // We create a node per character in the name.
*d415bd75Srobert  // SPARKLE becomes S <- P <- A <- R <- K <- L <- E
*d415bd75Srobert  // Once all  characters are inserted, the tree is compacted
*d415bd75Srobert  void insert(llvm::StringRef Name, char32_t Codepoint) {
*d415bd75Srobert    Node *N = Root.get();
*d415bd75Srobert    for (auto Ch : Name) {
*d415bd75Srobert      std::string Label(1, Ch);
*d415bd75Srobert      auto It = llvm::find_if(N->Children,
*d415bd75Srobert                              [&](const auto &C) { return C->Name == Label; });
*d415bd75Srobert      if (It == N->Children.end()) {
*d415bd75Srobert        It = N->Children.insert(It, std::make_unique<Node>(Label, N));
*d415bd75Srobert      }
*d415bd75Srobert      N = It->get();
*d415bd75Srobert    }
*d415bd75Srobert    N->Value = Codepoint;
*d415bd75Srobert  }
*d415bd75Srobert
*d415bd75Srobert  void compact() { compact(Root.get()); }
*d415bd75Srobert
*d415bd75Srobert  // This creates 2 arrays of bytes from the tree:
*d415bd75Srobert  // A serialized dictionary of node labels,
*d415bd75Srobert  // And the nodes themselves.
*d415bd75Srobert  // The name of each label is found by indexing into the dictionary.
*d415bd75Srobert  // The longest names are inserted first into the dictionary,
*d415bd75Srobert  // in the hope it will contain shorter labels as substring,
*d415bd75Srobert  // thereby reducing duplication.
*d415bd75Srobert  // We could theorically be more clever by trying to minimizing the size
*d415bd75Srobert  // of the dictionary.
*d415bd75Srobert  std::pair<std::string, std::vector<uint8_t>> serialize() {
*d415bd75Srobert    std::set<std::string> Names = this->getNameFragments();
*d415bd75Srobert    std::vector<std::string> Sorted(Names.begin(), Names.end());
*d415bd75Srobert    llvm::sort(Sorted, [](const auto &a, const auto &b) {
*d415bd75Srobert      return a.size() > b.size();
*d415bd75Srobert    });
*d415bd75Srobert    std::string Dict(Letters.begin(), Letters.end());
*d415bd75Srobert    Dict.reserve(50000);
*d415bd75Srobert    for (const std::string &Name : Sorted) {
*d415bd75Srobert      if (Name.size() <= 1)
*d415bd75Srobert        continue;
*d415bd75Srobert      if (Dict.find(Name) != std::string::npos)
*d415bd75Srobert        continue;
*d415bd75Srobert      Dict += Name;
*d415bd75Srobert    }
*d415bd75Srobert
*d415bd75Srobert    if (Dict.size() >= std::numeric_limits<uint16_t>::max()) {
*d415bd75Srobert      fprintf(stderr, "Dictionary too big  to be serialized");
*d415bd75Srobert      exit(1);
*d415bd75Srobert    }
*d415bd75Srobert
*d415bd75Srobert    auto Bytes = dumpIndex(Dict);
*d415bd75Srobert    return {Dict, Bytes};
*d415bd75Srobert  }
*d415bd75Srobert
*d415bd75Srobert  std::set<std::string> getNameFragments() {
*d415bd75Srobert    std::set<std::string> Keys;
*d415bd75Srobert    collectKeys(Root.get(), Keys);
*d415bd75Srobert    return Keys;
*d415bd75Srobert  }
*d415bd75Srobert
*d415bd75Srobert  // Maps a valid char in an Unicode character name
*d415bd75Srobert  // To a 6 bits index.
*d415bd75Srobert  static uint8_t letter(char C) {
*d415bd75Srobert    auto Pos = Letters.find(C);
*d415bd75Srobert    assert(Pos != std::string::npos &&
*d415bd75Srobert           "Invalid letter in Unicode character name");
*d415bd75Srobert    return Pos;
*d415bd75Srobert  }
*d415bd75Srobert
*d415bd75Srobert  // clang-format off
*d415bd75Srobert  // +================+============+======================+=============+========+===+==============+===============+
*d415bd75Srobert  // | 0          | 1             | 2-7 (6)              | 8-23        | 24-44  |    | 46           | 47            |
*d415bd75Srobert  // +================+============+======================+=============+========+===+==============+===============+
*d415bd75Srobert  // | Has Value |  Has Long Name | Letter OR Name Size  | Dict Index  | Value  |    | Has Sibling  | Has Children  |
*d415bd75Srobert  // +----------------+------------+----------------------+-------------+--------+---+--------------+---------------+
*d415bd75Srobert  // clang-format on
*d415bd75Srobert
*d415bd75Srobert  std::vector<uint8_t> dumpIndex(const std::string &Dict) {
*d415bd75Srobert    struct ChildrenOffset {
*d415bd75Srobert      Node *FirstChild;
*d415bd75Srobert      std::size_t Offset;
*d415bd75Srobert      bool HasValue;
*d415bd75Srobert    };
*d415bd75Srobert
*d415bd75Srobert    // Keep track of the start of each node
*d415bd75Srobert    // position in the serialized data.
*d415bd75Srobert    std::unordered_map<Node *, int32_t> Offsets;
*d415bd75Srobert
*d415bd75Srobert    // Keep track of where to write the index
*d415bd75Srobert    // of the first children
*d415bd75Srobert    std::vector<ChildrenOffset> ChildrenOffsets;
*d415bd75Srobert    std::unordered_map<Node *, bool> SiblingTracker;
*d415bd75Srobert    std::deque<Node *> AllNodes;
*d415bd75Srobert    std::vector<uint8_t> Bytes;
*d415bd75Srobert    Bytes.reserve(250'000);
*d415bd75Srobert    // This leading byte is used by the reading code to detect the root node.
*d415bd75Srobert    Bytes.push_back(0);
*d415bd75Srobert
*d415bd75Srobert    auto CollectChildren = [&SiblingTracker, &AllNodes](const auto &Children) {
*d415bd75Srobert      for (std::size_t Index = 0; Index < Children.size(); Index++) {
*d415bd75Srobert        const std::unique_ptr<Node> &Child = Children[Index];
*d415bd75Srobert        AllNodes.push_back(Child.get());
*d415bd75Srobert        if (Index != Children.size() - 1)
*d415bd75Srobert          SiblingTracker[Child.get()] = true;
*d415bd75Srobert      }
*d415bd75Srobert    };
*d415bd75Srobert    CollectChildren(Root->Children);
*d415bd75Srobert
*d415bd75Srobert    while (!AllNodes.empty()) {
*d415bd75Srobert      const std::size_t Offset = Bytes.size();
*d415bd75Srobert      Node *const N = AllNodes.front();
*d415bd75Srobert      AllNodes.pop_front();
*d415bd75Srobert
*d415bd75Srobert      assert(!N->Name.empty());
*d415bd75Srobert      Offsets[N] = Offset;
*d415bd75Srobert
*d415bd75Srobert      uint8_t FirstByte = (!!N->Value) ? 0x80 : 0;
*d415bd75Srobert      // Single letter node are indexed in 6 bits
*d415bd75Srobert      if (N->Name.size() == 1) {
*d415bd75Srobert        FirstByte |= letter(N->Name[0]);
*d415bd75Srobert        Bytes.push_back(FirstByte);
*d415bd75Srobert      } else {
*d415bd75Srobert        // Otherwise we use a 16 bits index
*d415bd75Srobert        FirstByte = FirstByte | uint8_t(N->Name.size()) | 0x40;
*d415bd75Srobert        Bytes.push_back(FirstByte);
*d415bd75Srobert        auto PosInDict = Dict.find(N->Name);
*d415bd75Srobert        assert(PosInDict != std::string::npos);
*d415bd75Srobert        uint8_t Low = PosInDict;
*d415bd75Srobert        uint8_t High = ((PosInDict >> 8) & 0xFF);
*d415bd75Srobert        Bytes.push_back(High);
*d415bd75Srobert        Bytes.push_back(Low);
*d415bd75Srobert      }
*d415bd75Srobert
*d415bd75Srobert      const bool HasSibling = SiblingTracker.count(N) != 0;
*d415bd75Srobert      const bool HasChildren = N->Children.size() != 0;
*d415bd75Srobert
*d415bd75Srobert      if (!!N->Value) {
*d415bd75Srobert        uint32_t Value = (*(N->Value) << 3);
*d415bd75Srobert        uint8_t H = ((Value >> 16) & 0xFF);
*d415bd75Srobert        uint8_t M = ((Value >> 8) & 0xFF);
*d415bd75Srobert        uint8_t L = (Value & 0xFF) | uint8_t(HasSibling ? 0x01 : 0) |
*d415bd75Srobert                    uint8_t(HasChildren ? 0x02 : 0);
*d415bd75Srobert
*d415bd75Srobert        Bytes.push_back(H);
*d415bd75Srobert        Bytes.push_back(M);
*d415bd75Srobert        Bytes.push_back(L);
*d415bd75Srobert
*d415bd75Srobert        if (HasChildren) {
*d415bd75Srobert          ChildrenOffsets.push_back(
*d415bd75Srobert              ChildrenOffset{N->Children[0].get(), Bytes.size(), true});
*d415bd75Srobert          // index of the first children
*d415bd75Srobert          Bytes.push_back(0x00);
*d415bd75Srobert          Bytes.push_back(0x00);
*d415bd75Srobert          Bytes.push_back(0x00);
*d415bd75Srobert        }
*d415bd75Srobert      } else {
*d415bd75Srobert        // When there is no value (that's most intermediate nodes)
*d415bd75Srobert        // Dispense of the 3 values bytes, and only store
*d415bd75Srobert        // 1 byte to track whether the node has sibling and children
*d415bd75Srobert        // + 2 bytes for the index of the first children if necessary.
*d415bd75Srobert        // That index also uses bytes 0-6 of the previous byte.
*d415bd75Srobert        uint8_t Byte =
*d415bd75Srobert            uint8_t(HasSibling ? 0x80 : 0) | uint8_t(HasChildren ? 0x40 : 0);
*d415bd75Srobert        Bytes.push_back(Byte);
*d415bd75Srobert        if (HasChildren) {
*d415bd75Srobert          ChildrenOffsets.emplace_back(
*d415bd75Srobert              ChildrenOffset{N->Children[0].get(), Bytes.size() - 1, false});
*d415bd75Srobert          Bytes.push_back(0x00);
*d415bd75Srobert          Bytes.push_back(0x00);
*d415bd75Srobert        }
*d415bd75Srobert      }
*d415bd75Srobert      CollectChildren(N->Children);
*d415bd75Srobert    }
*d415bd75Srobert
*d415bd75Srobert    // Once all the nodes are in the inndex
*d415bd75Srobert    // Fill the bytes we left to indicate the position
*d415bd75Srobert    // of the children
*d415bd75Srobert    for (const ChildrenOffset &Parent : ChildrenOffsets) {
*d415bd75Srobert      const auto It = Offsets.find(Parent.FirstChild);
*d415bd75Srobert      assert(It != Offsets.end());
*d415bd75Srobert      std::size_t Pos = It->second;
*d415bd75Srobert      if (Parent.HasValue) {
*d415bd75Srobert        Bytes[Parent.Offset] = ((Pos >> 16) & 0xFF);
*d415bd75Srobert      } else {
*d415bd75Srobert        Bytes[Parent.Offset] =
*d415bd75Srobert            Bytes[Parent.Offset] | uint8_t((Pos >> 16) & 0xFF);
*d415bd75Srobert      }
*d415bd75Srobert      Bytes[Parent.Offset + 1] = ((Pos >> 8) & 0xFF);
*d415bd75Srobert      Bytes[Parent.Offset + 2] = Pos & 0xFF;
*d415bd75Srobert    }
*d415bd75Srobert
*d415bd75Srobert    // Add some padding so that the deserialization code
*d415bd75Srobert    // doesn't try to read past the enf of the array.
*d415bd75Srobert    Bytes.push_back(0);
*d415bd75Srobert    Bytes.push_back(0);
*d415bd75Srobert    Bytes.push_back(0);
*d415bd75Srobert    Bytes.push_back(0);
*d415bd75Srobert    Bytes.push_back(0);
*d415bd75Srobert    Bytes.push_back(0);
*d415bd75Srobert
*d415bd75Srobert    return Bytes;
*d415bd75Srobert  }
*d415bd75Srobert
*d415bd75Srobertprivate:
*d415bd75Srobert  void collectKeys(Node *N, std::set<std::string> &Keys) {
*d415bd75Srobert    Keys.insert(N->Name);
*d415bd75Srobert    for (const std::unique_ptr<Node> &Child : N->Children) {
*d415bd75Srobert      collectKeys(Child.get(), Keys);
*d415bd75Srobert    }
*d415bd75Srobert  }
*d415bd75Srobert
*d415bd75Srobert  // Merge sequences of 1-character nodes
*d415bd75Srobert  // This greatly reduce the total number of nodes,
*d415bd75Srobert  // and therefore the size of the index.
*d415bd75Srobert  // When the tree gets serialized, we only have 5 bytes to store the
*d415bd75Srobert  // size of a name. Overlong names (>32 characters) are therefore
*d415bd75Srobert  // kep into separate nodes
*d415bd75Srobert  void compact(Node *N) {
*d415bd75Srobert    for (auto &&Child : N->Children) {
*d415bd75Srobert      compact(Child.get());
*d415bd75Srobert    }
*d415bd75Srobert    if (N->Parent && N->Parent->Children.size() == 1 && !N->Parent->Value &&
*d415bd75Srobert        (N->Parent->Name.size() + N->Name.size() <= 32)) {
*d415bd75Srobert      N->Parent->Value = N->Value;
*d415bd75Srobert      N->Parent->Name += N->Name;
*d415bd75Srobert      N->Parent->Children = std::move(N->Children);
*d415bd75Srobert      for (std::unique_ptr<Node> &c : N->Parent->Children) {
*d415bd75Srobert        c->Parent = N->Parent;
*d415bd75Srobert      }
*d415bd75Srobert    }
*d415bd75Srobert  }
*d415bd75Srobert  struct Node {
*d415bd75Srobert    Node(std::string Name, Node *Parent = nullptr)
*d415bd75Srobert        : Name(Name), Parent(Parent) {}
*d415bd75Srobert
*d415bd75Srobert    std::vector<std::unique_ptr<Node>> Children;
*d415bd75Srobert    std::string Name;
*d415bd75Srobert    Node *Parent = nullptr;
*d415bd75Srobert    std::optional<char32_t> Value;
*d415bd75Srobert  };
*d415bd75Srobert
*d415bd75Srobert  std::unique_ptr<Node> Root = std::make_unique<Node>("");
*d415bd75Srobert};
*d415bd75Srobert
*d415bd75Srobertextern const char *UnicodeLicense;
*d415bd75Srobert
*d415bd75Srobertint main(int argc, char **argv) {
*d415bd75Srobert  printf("Unicode name -> codepoint mapping generator\n"
*d415bd75Srobert         "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
*d415bd75Srobert         argv[0]);
*d415bd75Srobert  printf("NameAliases.txt can be found at "
*d415bd75Srobert         "https://unicode.org/Public/15.0.0/ucd/NameAliases.txt\n"
*d415bd75Srobert         "UnicodeData.txt can be found at "
*d415bd75Srobert         "https://unicode.org/Public/15.0.0/ucd/UnicodeData.txt\n\n");
*d415bd75Srobert
*d415bd75Srobert  if (argc != 4)
*d415bd75Srobert    return EXIT_FAILURE;
*d415bd75Srobert
*d415bd75Srobert  FILE *Out = fopen(argv[3], "w");
*d415bd75Srobert  if (!Out) {
*d415bd75Srobert    printf("Error creating output file.\n");
*d415bd75Srobert    return EXIT_FAILURE;
*d415bd75Srobert  }
*d415bd75Srobert
*d415bd75Srobert  Trie T;
*d415bd75Srobert  uint32_t NameCount = 0;
*d415bd75Srobert  std::size_t LongestName = 0;
*d415bd75Srobert  auto Entries = loadDataFiles(argv[1], argv[2]);
*d415bd75Srobert  for (const std::pair<const char32_t, std::string> &Entry : Entries) {
*d415bd75Srobert    char32_t Codepoint = Entry.first;
*d415bd75Srobert    const std::string &Name = Entry.second;
*d415bd75Srobert    // Ignore names which are not valid.
*d415bd75Srobert    if (Name.empty() || !llvm::all_of(Name, [](char C) {
*d415bd75Srobert          return llvm::is_contained(Letters, C);
*d415bd75Srobert        })) {
*d415bd75Srobert      continue;
*d415bd75Srobert    }
*d415bd75Srobert    printf("%06x: %s\n", static_cast<unsigned int>(Codepoint), Name.c_str());
*d415bd75Srobert    T.insert(Name, Codepoint);
*d415bd75Srobert    LongestName =
*d415bd75Srobert        std::max(LongestName, std::size_t(llvm::count_if(Name, llvm::isAlnum)));
*d415bd75Srobert    NameCount++;
*d415bd75Srobert  }
*d415bd75Srobert  T.compact();
*d415bd75Srobert
*d415bd75Srobert  std::pair<std::string, std::vector<uint8_t>> Data = T.serialize();
*d415bd75Srobert  const std::string &Dict = Data.first;
*d415bd75Srobert  const std::vector<uint8_t> &Tree = Data.second;
*d415bd75Srobert
*d415bd75Srobert  fprintf(Out, R"(
*d415bd75Srobert//===------------- Support/UnicodeNameToCodepointGenerated.cpp ------------===//
*d415bd75Srobert// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
*d415bd75Srobert// See https://llvm.org/LICENSE.txt for license information.
*d415bd75Srobert// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*d415bd75Srobert//
*d415bd75Srobert//===----------------------------------------------------------------------===//
*d415bd75Srobert//
*d415bd75Srobert// This file implements mapping the name of a unicode code point to its value.
*d415bd75Srobert//
*d415bd75Srobert// This file was generated using %s.
*d415bd75Srobert// Do not edit manually.
*d415bd75Srobert//
*d415bd75Srobert//===----------------------------------------------------------------------===//
*d415bd75Srobert%s
*d415bd75Srobert
*d415bd75Srobert
*d415bd75Srobert
*d415bd75Srobert#include "llvm/Support/Compiler.h"
*d415bd75Srobert#include <cstddef>
*d415bd75Srobert#include <cstdint>
*d415bd75Srobert)",
*d415bd75Srobert          argv[0], UnicodeLicense);
*d415bd75Srobert
*d415bd75Srobert  fprintf(Out,
*d415bd75Srobert          "namespace llvm { namespace sys { namespace unicode { \n"
*d415bd75Srobert          "extern const char *UnicodeNameToCodepointDict;\n"
*d415bd75Srobert          "extern const uint8_t *UnicodeNameToCodepointIndex;\n"
*d415bd75Srobert          "extern const std::size_t UnicodeNameToCodepointIndexSize;\n"
*d415bd75Srobert          "extern const std::size_t UnicodeNameToCodepointLargestNameSize;\n");
*d415bd75Srobert
*d415bd75Srobert  fprintf(Out, "const char* UnicodeNameToCodepointDict = \"%s\";\n",
*d415bd75Srobert          Dict.c_str());
*d415bd75Srobert
*d415bd75Srobert  fprintf(Out, "uint8_t UnicodeNameToCodepointIndex_[%zu] = {\n",
*d415bd75Srobert          Tree.size() + 1);
*d415bd75Srobert
*d415bd75Srobert  for (auto Byte : Tree) {
*d415bd75Srobert    fprintf(Out, "0x%02x,", Byte);
*d415bd75Srobert  }
*d415bd75Srobert
*d415bd75Srobert  fprintf(Out, "0};");
*d415bd75Srobert  fprintf(Out, "const uint8_t* UnicodeNameToCodepointIndex = "
*d415bd75Srobert               "UnicodeNameToCodepointIndex_; \n");
*d415bd75Srobert  fprintf(Out, "const std::size_t UnicodeNameToCodepointIndexSize = %zu;\n",
*d415bd75Srobert          Tree.size() + 1);
*d415bd75Srobert  fprintf(Out,
*d415bd75Srobert          "const std::size_t UnicodeNameToCodepointLargestNameSize = %zu;\n",
*d415bd75Srobert          LongestName);
*d415bd75Srobert  fprintf(Out, "\n}}}\n");
*d415bd75Srobert  fclose(Out);
*d415bd75Srobert  printf("Generated %s: %u Files.\nIndex: %f kB, Dictionary: %f kB.\nDone\n\n",
*d415bd75Srobert         argv[3], NameCount, Tree.size() / 1024.0, Dict.size() / 1024.0);
*d415bd75Srobert}
*d415bd75Srobert
// License of the Unicode data files. main() writes this text, which is a C
// comment, at the top of the generated source file.
const char *UnicodeLicense = R"(
/*
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE

See Terms of Use <https://www.unicode.org/copyright.html>
for definitions of Unicode Inc.’s Data Files and Software.

NOTICE TO USER: Carefully read the following legal agreement.
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT.
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
THE DATA FILES OR SOFTWARE.

COPYRIGHT AND PERMISSION NOTICE

Copyright © 1991-2022 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.

Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
(the "Data Files") or Unicode software and any associated documentation
(the "Software") to deal in the Data Files or Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of
the Data Files or Software, and to permit persons to whom the Data Files
or Software are furnished to do so, provided that either
(a) this copyright and permission notice appear with all copies
of the Data Files or Software, or
(b) this copyright and permission notice appear in associated
Documentation.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THE DATA FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.
*/
)";