1 //===- StringToOffsetTable.h - Emit a big concatenated string ---*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef LLVM_TABLEGEN_STRINGTOOFFSETTABLE_H 10 #define LLVM_TABLEGEN_STRINGTOOFFSETTABLE_H 11 12 #include "llvm/ADT/SmallString.h" 13 #include "llvm/ADT/StringExtras.h" 14 #include "llvm/ADT/StringMap.h" 15 #include "llvm/Support/FormatVariadic.h" 16 #include "llvm/Support/raw_ostream.h" 17 #include <optional> 18 19 namespace llvm { 20 21 /// StringToOffsetTable - This class uniques a bunch of nul-terminated strings 22 /// and keeps track of their offset in a massive contiguous string allocation. 23 /// It can then output this string blob and use indexes into the string to 24 /// reference each piece. 25 class StringToOffsetTable { 26 StringMap<unsigned> StringOffset; 27 std::string AggregateString; 28 29 public: 30 StringToOffsetTable() { 31 // Ensure we always put the empty string at offset zero. That lets empty 32 // initialization also be zero initialization for offsets into the table. 33 GetOrAddStringOffset(""); 34 } 35 36 bool empty() const { return StringOffset.empty(); } 37 size_t size() const { return AggregateString.size(); } 38 39 unsigned GetOrAddStringOffset(StringRef Str, bool appendZero = true) { 40 auto [II, Inserted] = StringOffset.insert({Str, size()}); 41 if (Inserted) { 42 // Add the string to the aggregate if this is the first time found. 43 AggregateString.append(Str.begin(), Str.end()); 44 if (appendZero) 45 AggregateString += '\0'; 46 } 47 48 return II->second; 49 } 50 51 // Returns the offset of `Str` in the table if its preset, else return 52 // std::nullopt. 53 std::optional<unsigned> GetStringOffset(StringRef Str) const { 54 auto II = StringOffset.find(Str); 55 if (II == StringOffset.end()) 56 return std::nullopt; 57 return II->second; 58 } 59 60 // Emit a string table definition with the provided name and indent. 61 // 62 // When possible, this uses string-literal concatenation to emit the string 63 // contents in a readable and searchable way. However, for (very) large string 64 // tables MSVC cannot reliably use string literals and so there we use a large 65 // character array. We still use a line oriented emission and add comments to 66 // provide searchability even in this case. 67 // 68 // The string table, and its input string contents, are always emitted as both 69 // `static` and `constexpr`. Both `Name` and (`Name` + "Storage") must be 70 // valid identifiers to declare. 71 void EmitStringTableDef(raw_ostream &OS, const Twine &Name, 72 const Twine &Indent = "") const { 73 OS << formatv(R"( 74 #ifdef __GNUC__ 75 #pragma GCC diagnostic push 76 #pragma GCC diagnostic ignored "-Woverlength-strings" 77 #endif 78 {0}static constexpr char {1}Storage[] = )", 79 Indent, Name); 80 81 // MSVC silently miscompiles string literals longer than 64k in some 82 // circumstances. When the string table is longer, emit it as an array of 83 // character literals. 84 bool UseChars = AggregateString.size() > (64 * 1024); 85 OS << (UseChars ? "{\n" : "\n"); 86 87 llvm::ListSeparator LineSep(UseChars ? ",\n" : "\n"); 88 llvm::SmallVector<StringRef> Strings(split(AggregateString, '\0')); 89 // We should always have an empty string at the start, and because these are 90 // null terminators rather than separators, we'll have one at the end as 91 // well. Skip the end one. 92 assert(Strings.front().empty() && "Expected empty initial string!"); 93 assert(Strings.back().empty() && 94 "Expected empty string at the end due to terminators!"); 95 Strings.pop_back(); 96 for (StringRef Str : Strings) { 97 OS << LineSep << Indent << " "; 98 // If we can, just emit this as a string literal to be concatenated. 99 if (!UseChars) { 100 OS << "\""; 101 OS.write_escaped(Str); 102 OS << "\\0\""; 103 continue; 104 } 105 106 llvm::ListSeparator CharSep(", "); 107 for (char C : Str) { 108 OS << CharSep << "'"; 109 OS.write_escaped(StringRef(&C, 1)); 110 OS << "'"; 111 } 112 OS << CharSep << "'\\0'"; 113 } 114 OS << LineSep << Indent << (UseChars ? "};" : " ;"); 115 116 OS << formatv(R"( 117 #ifdef __GNUC__ 118 #pragma GCC diagnostic pop 119 #endif 120 121 {0}static constexpr llvm::StringTable {1} = 122 {0} {1}Storage; 123 )", 124 Indent, Name); 125 } 126 127 // Emit the string as one single string. 128 void EmitString(raw_ostream &O) const { 129 // Escape the string. 130 SmallString<256> EscapedStr; 131 raw_svector_ostream(EscapedStr).write_escaped(AggregateString); 132 133 O << " \""; 134 unsigned CharsPrinted = 0; 135 for (unsigned i = 0, e = EscapedStr.size(); i != e; ++i) { 136 if (CharsPrinted > 70) { 137 O << "\"\n \""; 138 CharsPrinted = 0; 139 } 140 O << EscapedStr[i]; 141 ++CharsPrinted; 142 143 // Print escape sequences all together. 144 if (EscapedStr[i] != '\\') 145 continue; 146 147 assert(i + 1 < EscapedStr.size() && "Incomplete escape sequence!"); 148 if (isDigit(EscapedStr[i + 1])) { 149 assert(isDigit(EscapedStr[i + 2]) && isDigit(EscapedStr[i + 3]) && 150 "Expected 3 digit octal escape!"); 151 O << EscapedStr[++i]; 152 O << EscapedStr[++i]; 153 O << EscapedStr[++i]; 154 CharsPrinted += 3; 155 } else { 156 O << EscapedStr[++i]; 157 ++CharsPrinted; 158 } 159 } 160 O << "\""; 161 } 162 }; 163 164 } // end namespace llvm 165 166 #endif 167