xref: /llvm-project/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp (revision 76bbbcb41bcf4a1d7a26bb11b78cf97b60ea7d4b)
1 //===--- BuildConfusableTable.cpp - clang-tidy---------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "llvm/ADT/STLExtras.h"
10 #include "llvm/ADT/StringExtras.h"
11 #include "llvm/Support/ConvertUTF.h"
12 #include "llvm/Support/MemoryBuffer.h"
13 #include "llvm/Support/raw_ostream.h"
14 
15 using namespace llvm;
16 
main(int argc,char * argv[])17 int main(int argc, char *argv[]) {
18   auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true);
19   if (!ErrorOrBuffer)
20     return 1;
21   std::unique_ptr<MemoryBuffer> Buffer = std::move(ErrorOrBuffer.get());
22   StringRef Content = Buffer->getBuffer();
23   Content = Content.drop_until([](char c) { return c == '#'; });
24   SmallVector<StringRef> Lines;
25   SplitString(Content, Lines, "\r\n");
26 
27   std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries;
28   SmallVector<StringRef> Values;
29   for (StringRef Line : Lines) {
30     if (Line.starts_with("#"))
31       continue;
32 
33     Values.clear();
34     Line.split(Values, ';');
35     if (Values.size() < 2) {
36       errs() << "Failed to parse: " << Line << "\n";
37       return 2;
38     }
39 
40     llvm::StringRef From = Values[0].trim();
41     llvm::UTF32 CodePoint = 0;
42     From.getAsInteger(16, CodePoint);
43 
44     SmallVector<llvm::UTF32> To;
45     SmallVector<StringRef> ToN;
46     Values[1].split(ToN, ' ', -1, false);
47     for (StringRef To_ : ToN) {
48       llvm::UTF32 ToCodePoint = 0;
49       To_.trim().getAsInteger(16, ToCodePoint);
50       To.push_back(ToCodePoint);
51     }
52     // Sentinel
53     To.push_back(0);
54 
55     Entries.emplace_back(CodePoint, To);
56   }
57   llvm::sort(Entries);
58 
59   unsigned LargestValue =
60       std::max_element(Entries.begin(), Entries.end(),
61                        [](const auto &Entry0, const auto &Entry1) {
62                          return Entry0.second.size() < Entry1.second.size();
63                        })
64           ->second.size();
65 
66   std::error_code ec;
67   llvm::raw_fd_ostream os(argv[2], ec);
68 
69   // FIXME: If memory consumption and/or lookup time becomes a constraint, it
70   // maybe worth using a more elaborate data structure.
71   os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue
72      << "];} "
73         "ConfusableEntries[] = {\n";
74   for (const auto &Values : Entries) {
75     os << "  { ";
76     os << Values.first;
77     os << ", {";
78     for (auto CP : Values.second)
79       os << CP << ", ";
80 
81     os << "}},\n";
82   }
83   os << "};\n";
84   return 0;
85 }
86