xref: /llvm-project/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp (revision 76bbbcb41bcf4a1d7a26bb11b78cf97b60ea7d4b)
1c3574ef7Sserge-sans-paille //===--- BuildConfusableTable.cpp - clang-tidy---------------------------===//
2c3574ef7Sserge-sans-paille //
3c3574ef7Sserge-sans-paille // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4c3574ef7Sserge-sans-paille // See https://llvm.org/LICENSE.txt for license information.
5c3574ef7Sserge-sans-paille // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6c3574ef7Sserge-sans-paille //
7c3574ef7Sserge-sans-paille //===----------------------------------------------------------------------===//
8aba43035SDmitri Gribenko 
9aba43035SDmitri Gribenko #include "llvm/ADT/STLExtras.h"
10c3574ef7Sserge-sans-paille #include "llvm/ADT/StringExtras.h"
11c3574ef7Sserge-sans-paille #include "llvm/Support/ConvertUTF.h"
12c3574ef7Sserge-sans-paille #include "llvm/Support/MemoryBuffer.h"
13c3574ef7Sserge-sans-paille #include "llvm/Support/raw_ostream.h"
14c3574ef7Sserge-sans-paille 
15c3574ef7Sserge-sans-paille using namespace llvm;
16c3574ef7Sserge-sans-paille 
main(int argc,char * argv[])17c3574ef7Sserge-sans-paille int main(int argc, char *argv[]) {
18c3574ef7Sserge-sans-paille   auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true);
19c3574ef7Sserge-sans-paille   if (!ErrorOrBuffer)
20c3574ef7Sserge-sans-paille     return 1;
21c3574ef7Sserge-sans-paille   std::unique_ptr<MemoryBuffer> Buffer = std::move(ErrorOrBuffer.get());
22c3574ef7Sserge-sans-paille   StringRef Content = Buffer->getBuffer();
23c3574ef7Sserge-sans-paille   Content = Content.drop_until([](char c) { return c == '#'; });
24c3574ef7Sserge-sans-paille   SmallVector<StringRef> Lines;
25c3574ef7Sserge-sans-paille   SplitString(Content, Lines, "\r\n");
26c3574ef7Sserge-sans-paille 
27c3574ef7Sserge-sans-paille   std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries;
28c3574ef7Sserge-sans-paille   SmallVector<StringRef> Values;
29c3574ef7Sserge-sans-paille   for (StringRef Line : Lines) {
30*76bbbcb4SKazu Hirata     if (Line.starts_with("#"))
31c3574ef7Sserge-sans-paille       continue;
32c3574ef7Sserge-sans-paille 
33c3574ef7Sserge-sans-paille     Values.clear();
34c3574ef7Sserge-sans-paille     Line.split(Values, ';');
35c3574ef7Sserge-sans-paille     if (Values.size() < 2) {
36c3574ef7Sserge-sans-paille       errs() << "Failed to parse: " << Line << "\n";
37c3574ef7Sserge-sans-paille       return 2;
38c3574ef7Sserge-sans-paille     }
39c3574ef7Sserge-sans-paille 
40c3574ef7Sserge-sans-paille     llvm::StringRef From = Values[0].trim();
41cbdc3e1bSPiotr Zegar     llvm::UTF32 CodePoint = 0;
42c3574ef7Sserge-sans-paille     From.getAsInteger(16, CodePoint);
43c3574ef7Sserge-sans-paille 
44c3574ef7Sserge-sans-paille     SmallVector<llvm::UTF32> To;
45c3574ef7Sserge-sans-paille     SmallVector<StringRef> ToN;
46c3574ef7Sserge-sans-paille     Values[1].split(ToN, ' ', -1, false);
47c3574ef7Sserge-sans-paille     for (StringRef To_ : ToN) {
48cbdc3e1bSPiotr Zegar       llvm::UTF32 ToCodePoint = 0;
49c3574ef7Sserge-sans-paille       To_.trim().getAsInteger(16, ToCodePoint);
50c3574ef7Sserge-sans-paille       To.push_back(ToCodePoint);
51c3574ef7Sserge-sans-paille     }
52c3574ef7Sserge-sans-paille     // Sentinel
53c3574ef7Sserge-sans-paille     To.push_back(0);
54c3574ef7Sserge-sans-paille 
55c3574ef7Sserge-sans-paille     Entries.emplace_back(CodePoint, To);
56c3574ef7Sserge-sans-paille   }
57aba43035SDmitri Gribenko   llvm::sort(Entries);
58c3574ef7Sserge-sans-paille 
59c3574ef7Sserge-sans-paille   unsigned LargestValue =
60c3574ef7Sserge-sans-paille       std::max_element(Entries.begin(), Entries.end(),
61c3574ef7Sserge-sans-paille                        [](const auto &Entry0, const auto &Entry1) {
62c3574ef7Sserge-sans-paille                          return Entry0.second.size() < Entry1.second.size();
63c3574ef7Sserge-sans-paille                        })
64c3574ef7Sserge-sans-paille           ->second.size();
65c3574ef7Sserge-sans-paille 
66c3574ef7Sserge-sans-paille   std::error_code ec;
67c3574ef7Sserge-sans-paille   llvm::raw_fd_ostream os(argv[2], ec);
68c3574ef7Sserge-sans-paille 
69c3574ef7Sserge-sans-paille   // FIXME: If memory consumption and/or lookup time becomes a constraint, it
70c3574ef7Sserge-sans-paille   // maybe worth using a more elaborate data structure.
71c3574ef7Sserge-sans-paille   os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue
72c3574ef7Sserge-sans-paille      << "];} "
73c3574ef7Sserge-sans-paille         "ConfusableEntries[] = {\n";
74c3574ef7Sserge-sans-paille   for (const auto &Values : Entries) {
75c3574ef7Sserge-sans-paille     os << "  { ";
76c3574ef7Sserge-sans-paille     os << Values.first;
77c3574ef7Sserge-sans-paille     os << ", {";
78c3574ef7Sserge-sans-paille     for (auto CP : Values.second)
79c3574ef7Sserge-sans-paille       os << CP << ", ";
80c3574ef7Sserge-sans-paille 
81c3574ef7Sserge-sans-paille     os << "}},\n";
82c3574ef7Sserge-sans-paille   }
83c3574ef7Sserge-sans-paille   os << "};\n";
84c3574ef7Sserge-sans-paille   return 0;
85c3574ef7Sserge-sans-paille }
86