xref: /llvm-project/llvm/include/llvm/TableGen/StringToOffsetTable.h (revision f4de28a63c81c909df28b6b065fad19e2189c54e)
1 //===- StringToOffsetTable.h - Emit a big concatenated string ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef LLVM_TABLEGEN_STRINGTOOFFSETTABLE_H
10 #define LLVM_TABLEGEN_STRINGTOOFFSETTABLE_H
11 
12 #include "llvm/ADT/SmallString.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/StringMap.h"
15 #include "llvm/Support/FormatVariadic.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <optional>
18 
19 namespace llvm {
20 
21 /// StringToOffsetTable - This class uniques a bunch of nul-terminated strings
22 /// and keeps track of their offset in a massive contiguous string allocation.
23 /// It can then output this string blob and use indexes into the string to
24 /// reference each piece.
25 class StringToOffsetTable {
26   StringMap<unsigned> StringOffset;
27   std::string AggregateString;
28 
29 public:
30   StringToOffsetTable() {
31     // Ensure we always put the empty string at offset zero. That lets empty
32     // initialization also be zero initialization for offsets into the table.
33     GetOrAddStringOffset("");
34   }
35 
36   bool empty() const { return StringOffset.empty(); }
37   size_t size() const { return AggregateString.size(); }
38 
39   unsigned GetOrAddStringOffset(StringRef Str, bool appendZero = true) {
40     auto [II, Inserted] = StringOffset.insert({Str, size()});
41     if (Inserted) {
42       // Add the string to the aggregate if this is the first time found.
43       AggregateString.append(Str.begin(), Str.end());
44       if (appendZero)
45         AggregateString += '\0';
46     }
47 
48     return II->second;
49   }
50 
51   // Returns the offset of `Str` in the table if its preset, else return
52   // std::nullopt.
53   std::optional<unsigned> GetStringOffset(StringRef Str) const {
54     auto II = StringOffset.find(Str);
55     if (II == StringOffset.end())
56       return std::nullopt;
57     return II->second;
58   }
59 
60   // Emit a string table definition with the provided name and indent.
61   //
62   // When possible, this uses string-literal concatenation to emit the string
63   // contents in a readable and searchable way. However, for (very) large string
64   // tables MSVC cannot reliably use string literals and so there we use a large
65   // character array. We still use a line oriented emission and add comments to
66   // provide searchability even in this case.
67   //
68   // The string table, and its input string contents, are always emitted as both
69   // `static` and `constexpr`. Both `Name` and (`Name` + "Storage") must be
70   // valid identifiers to declare.
71   void EmitStringTableDef(raw_ostream &OS, const Twine &Name,
72                           const Twine &Indent = "") const {
73     OS << formatv(R"(
74 #ifdef __GNUC__
75 #pragma GCC diagnostic push
76 #pragma GCC diagnostic ignored "-Woverlength-strings"
77 #endif
78 {0}static constexpr char {1}Storage[] = )",
79                   Indent, Name);
80 
81     // MSVC silently miscompiles string literals longer than 64k in some
82     // circumstances. When the string table is longer, emit it as an array of
83     // character literals.
84     bool UseChars = AggregateString.size() > (64 * 1024);
85     OS << (UseChars ? "{\n" : "\n");
86 
87     llvm::ListSeparator LineSep(UseChars ? ",\n" : "\n");
88     llvm::SmallVector<StringRef> Strings(split(AggregateString, '\0'));
89     // We should always have an empty string at the start, and because these are
90     // null terminators rather than separators, we'll have one at the end as
91     // well. Skip the end one.
92     assert(Strings.front().empty() && "Expected empty initial string!");
93     assert(Strings.back().empty() &&
94            "Expected empty string at the end due to terminators!");
95     Strings.pop_back();
96     for (StringRef Str : Strings) {
97       OS << LineSep << Indent << "  ";
98       // If we can, just emit this as a string literal to be concatenated.
99       if (!UseChars) {
100         OS << "\"";
101         OS.write_escaped(Str);
102         OS << "\\0\"";
103         continue;
104       }
105 
106       llvm::ListSeparator CharSep(", ");
107       for (char C : Str) {
108         OS << CharSep << "'";
109         OS.write_escaped(StringRef(&C, 1));
110         OS << "'";
111       }
112       OS << CharSep << "'\\0'";
113     }
114     OS << LineSep << Indent << (UseChars ? "};" : "  ;");
115 
116     OS << formatv(R"(
117 #ifdef __GNUC__
118 #pragma GCC diagnostic pop
119 #endif
120 
121 {0}static constexpr llvm::StringTable {1} =
122 {0}    {1}Storage;
123 )",
124                   Indent, Name);
125   }
126 
127   // Emit the string as one single string.
128   void EmitString(raw_ostream &O) const {
129     // Escape the string.
130     SmallString<256> EscapedStr;
131     raw_svector_ostream(EscapedStr).write_escaped(AggregateString);
132 
133     O << "    \"";
134     unsigned CharsPrinted = 0;
135     for (unsigned i = 0, e = EscapedStr.size(); i != e; ++i) {
136       if (CharsPrinted > 70) {
137         O << "\"\n    \"";
138         CharsPrinted = 0;
139       }
140       O << EscapedStr[i];
141       ++CharsPrinted;
142 
143       // Print escape sequences all together.
144       if (EscapedStr[i] != '\\')
145         continue;
146 
147       assert(i + 1 < EscapedStr.size() && "Incomplete escape sequence!");
148       if (isDigit(EscapedStr[i + 1])) {
149         assert(isDigit(EscapedStr[i + 2]) && isDigit(EscapedStr[i + 3]) &&
150                "Expected 3 digit octal escape!");
151         O << EscapedStr[++i];
152         O << EscapedStr[++i];
153         O << EscapedStr[++i];
154         CharsPrinted += 3;
155       } else {
156         O << EscapedStr[++i];
157         ++CharsPrinted;
158       }
159     }
160     O << "\"";
161   }
162 };
163 
164 } // end namespace llvm
165 
166 #endif
167