xref: /openbsd-src/gnu/llvm/lld/MachO/ExportTrie.cpp (revision dfe94b169149f14cc1aee2cf6dad58a8d9a1860c)
1bb684c34Spatrick //===- ExportTrie.cpp -----------------------------------------------------===//
2bb684c34Spatrick //
3bb684c34Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4bb684c34Spatrick // See https://llvm.org/LICENSE.txt for license information.
5bb684c34Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6bb684c34Spatrick //
7bb684c34Spatrick //===----------------------------------------------------------------------===//
8bb684c34Spatrick //
9bb684c34Spatrick // This is a partial implementation of the Mach-O export trie format. It's
10bb684c34Spatrick // essentially a symbol table encoded as a compressed prefix trie, meaning that
11bb684c34Spatrick // the common prefixes of each symbol name are shared for a more compact
12bb684c34Spatrick // representation. The prefixes are stored on the edges of the trie, and one
13bb684c34Spatrick // edge can represent multiple characters. For example, given two exported
14bb684c34Spatrick // symbols _bar and _baz, we will have a trie like this (terminal nodes are
15bb684c34Spatrick // marked with an asterisk):
16bb684c34Spatrick //
17bb684c34Spatrick //              +-+-+
18bb684c34Spatrick //              |   | // root node
19bb684c34Spatrick //              +-+-+
20bb684c34Spatrick //                |
21bb684c34Spatrick //                | _ba
22bb684c34Spatrick //                |
23bb684c34Spatrick //              +-+-+
24bb684c34Spatrick //              |   |
25bb684c34Spatrick //              +-+-+
26bb684c34Spatrick //           r /     \ z
27bb684c34Spatrick //            /       \
28bb684c34Spatrick //        +-+-+       +-+-+
29bb684c34Spatrick //        | * |       | * |
30bb684c34Spatrick //        +-+-+       +-+-+
31bb684c34Spatrick //
32bb684c34Spatrick // More documentation of the format can be found in
33bb684c34Spatrick // llvm/tools/obj2yaml/macho2yaml.cpp.
34bb684c34Spatrick //
35bb684c34Spatrick //===----------------------------------------------------------------------===//
36bb684c34Spatrick 
37bb684c34Spatrick #include "ExportTrie.h"
38bb684c34Spatrick #include "Symbols.h"
39bb684c34Spatrick 
40bb684c34Spatrick #include "lld/Common/ErrorHandler.h"
41bb684c34Spatrick #include "lld/Common/Memory.h"
42bb684c34Spatrick #include "llvm/BinaryFormat/MachO.h"
43bb684c34Spatrick #include "llvm/Support/LEB128.h"
44*dfe94b16Srobert #include <optional>
45bb684c34Spatrick 
46bb684c34Spatrick using namespace llvm;
47bb684c34Spatrick using namespace lld;
48bb684c34Spatrick using namespace lld::macho;
49bb684c34Spatrick 
50bb684c34Spatrick namespace {
51bb684c34Spatrick 
52bb684c34Spatrick struct Edge {
Edge__anon13b4c56f0111::Edge53bb684c34Spatrick   Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
54bb684c34Spatrick 
55bb684c34Spatrick   StringRef substring;
56bb684c34Spatrick   struct TrieNode *child;
57bb684c34Spatrick };
58bb684c34Spatrick 
59bb684c34Spatrick struct ExportInfo {
60bb684c34Spatrick   uint64_t address;
611cf9926bSpatrick   uint8_t flags = 0;
ExportInfo__anon13b4c56f0111::ExportInfo621cf9926bSpatrick   ExportInfo(const Symbol &sym, uint64_t imageBase)
631cf9926bSpatrick       : address(sym.getVA() - imageBase) {
641cf9926bSpatrick     using namespace llvm::MachO;
651cf9926bSpatrick     // Set the symbol type.
661cf9926bSpatrick     if (sym.isWeakDef())
671cf9926bSpatrick       flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION;
68bb684c34Spatrick     // TODO: Add proper support for re-exports & stub-and-resolver flags.
691cf9926bSpatrick 
701cf9926bSpatrick     // Set the symbol kind.
711cf9926bSpatrick     if (sym.isTlv()) {
721cf9926bSpatrick       flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL;
731cf9926bSpatrick     } else if (auto *defined = dyn_cast<Defined>(&sym)) {
741cf9926bSpatrick       if (defined->isAbsolute())
751cf9926bSpatrick         flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE;
761cf9926bSpatrick     }
771cf9926bSpatrick   }
78bb684c34Spatrick };
79bb684c34Spatrick 
80bb684c34Spatrick } // namespace
81bb684c34Spatrick 
82bb684c34Spatrick struct macho::TrieNode {
83bb684c34Spatrick   std::vector<Edge> edges;
84*dfe94b16Srobert   std::optional<ExportInfo> info;
85bb684c34Spatrick   // Estimated offset from the start of the serialized trie to the current node.
86bb684c34Spatrick   // This will converge to the true offset when updateOffset() is run to a
87bb684c34Spatrick   // fixpoint.
88bb684c34Spatrick   size_t offset = 0;
89bb684c34Spatrick 
90bb684c34Spatrick   // Returns whether the new estimated offset differs from the old one.
91bb684c34Spatrick   bool updateOffset(size_t &nextOffset);
92bb684c34Spatrick   void writeTo(uint8_t *buf) const;
93bb684c34Spatrick };
94bb684c34Spatrick 
updateOffset(size_t & nextOffset)95bb684c34Spatrick bool TrieNode::updateOffset(size_t &nextOffset) {
96bb684c34Spatrick   // Size of the whole node (including the terminalSize and the outgoing edges.)
97bb684c34Spatrick   // In contrast, terminalSize only records the size of the other data in the
98bb684c34Spatrick   // node.
99bb684c34Spatrick   size_t nodeSize;
100bb684c34Spatrick   if (info) {
101bb684c34Spatrick     uint32_t terminalSize =
1021cf9926bSpatrick         getULEB128Size(info->flags) + getULEB128Size(info->address);
103bb684c34Spatrick     // Overall node size so far is the uleb128 size of the length of the symbol
104bb684c34Spatrick     // info + the symbol info itself.
105bb684c34Spatrick     nodeSize = terminalSize + getULEB128Size(terminalSize);
106bb684c34Spatrick   } else {
107bb684c34Spatrick     nodeSize = 1; // Size of terminalSize (which has a value of 0)
108bb684c34Spatrick   }
109bb684c34Spatrick   // Compute size of all child edges.
110bb684c34Spatrick   ++nodeSize; // Byte for number of children.
1111cf9926bSpatrick   for (const Edge &edge : edges) {
112bb684c34Spatrick     nodeSize += edge.substring.size() + 1             // String length.
113bb684c34Spatrick                 + getULEB128Size(edge.child->offset); // Offset len.
114bb684c34Spatrick   }
115bb684c34Spatrick   // On input, 'nextOffset' is the new preferred location for this node.
116bb684c34Spatrick   bool result = (offset != nextOffset);
117bb684c34Spatrick   // Store new location in node object for use by parents.
118bb684c34Spatrick   offset = nextOffset;
119bb684c34Spatrick   nextOffset += nodeSize;
120bb684c34Spatrick   return result;
121bb684c34Spatrick }
122bb684c34Spatrick 
writeTo(uint8_t * buf) const123bb684c34Spatrick void TrieNode::writeTo(uint8_t *buf) const {
124bb684c34Spatrick   buf += offset;
125bb684c34Spatrick   if (info) {
126bb684c34Spatrick     // TrieNodes with Symbol info: size, flags address
127bb684c34Spatrick     uint32_t terminalSize =
1281cf9926bSpatrick         getULEB128Size(info->flags) + getULEB128Size(info->address);
129bb684c34Spatrick     buf += encodeULEB128(terminalSize, buf);
1301cf9926bSpatrick     buf += encodeULEB128(info->flags, buf);
131bb684c34Spatrick     buf += encodeULEB128(info->address, buf);
132bb684c34Spatrick   } else {
133bb684c34Spatrick     // TrieNode with no Symbol info.
134bb684c34Spatrick     *buf++ = 0; // terminalSize
135bb684c34Spatrick   }
136bb684c34Spatrick   // Add number of children. TODO: Handle case where we have more than 256.
137bb684c34Spatrick   assert(edges.size() < 256);
138bb684c34Spatrick   *buf++ = edges.size();
139bb684c34Spatrick   // Append each child edge substring and node offset.
140bb684c34Spatrick   for (const Edge &edge : edges) {
141bb684c34Spatrick     memcpy(buf, edge.substring.data(), edge.substring.size());
142bb684c34Spatrick     buf += edge.substring.size();
143bb684c34Spatrick     *buf++ = '\0';
144bb684c34Spatrick     buf += encodeULEB128(edge.child->offset, buf);
145bb684c34Spatrick   }
146bb684c34Spatrick }
147bb684c34Spatrick 
~TrieBuilder()148*dfe94b16Srobert TrieBuilder::~TrieBuilder() {
149*dfe94b16Srobert   for (TrieNode *node : nodes)
150*dfe94b16Srobert     delete node;
151*dfe94b16Srobert }
152*dfe94b16Srobert 
makeNode()153bb684c34Spatrick TrieNode *TrieBuilder::makeNode() {
154*dfe94b16Srobert   auto *node = new TrieNode();
155bb684c34Spatrick   nodes.emplace_back(node);
156bb684c34Spatrick   return node;
157bb684c34Spatrick }
158bb684c34Spatrick 
charAt(const Symbol * sym,size_t pos)159bb684c34Spatrick static int charAt(const Symbol *sym, size_t pos) {
160bb684c34Spatrick   StringRef str = sym->getName();
161bb684c34Spatrick   if (pos >= str.size())
162bb684c34Spatrick     return -1;
163bb684c34Spatrick   return str[pos];
164bb684c34Spatrick }
165bb684c34Spatrick 
166bb684c34Spatrick // Build the trie by performing a three-way radix quicksort: We start by sorting
167bb684c34Spatrick // the strings by their first characters, then sort the strings with the same
168bb684c34Spatrick // first characters by their second characters, and so on recursively. Each
169bb684c34Spatrick // time the prefixes diverge, we add a node to the trie.
170bb684c34Spatrick //
171bb684c34Spatrick // node:    The most recently created node along this path in the trie (i.e.
172bb684c34Spatrick //          the furthest from the root.)
173bb684c34Spatrick // lastPos: The prefix length of the most recently created node, i.e. the number
174bb684c34Spatrick //          of characters along its path from the root.
175bb684c34Spatrick // pos:     The string index we are currently sorting on. Note that each symbol
176bb684c34Spatrick //          S contained in vec has the same prefix S[0...pos).
sortAndBuild(MutableArrayRef<const Symbol * > vec,TrieNode * node,size_t lastPos,size_t pos)177bb684c34Spatrick void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
178bb684c34Spatrick                                TrieNode *node, size_t lastPos, size_t pos) {
179bb684c34Spatrick tailcall:
180bb684c34Spatrick   if (vec.empty())
181bb684c34Spatrick     return;
182bb684c34Spatrick 
183bb684c34Spatrick   // Partition items so that items in [0, i) are less than the pivot,
184bb684c34Spatrick   // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
185bb684c34Spatrick   // the pivot.
186bb684c34Spatrick   const Symbol *pivotSymbol = vec[vec.size() / 2];
187bb684c34Spatrick   int pivot = charAt(pivotSymbol, pos);
188bb684c34Spatrick   size_t i = 0;
189bb684c34Spatrick   size_t j = vec.size();
190bb684c34Spatrick   for (size_t k = 0; k < j;) {
191bb684c34Spatrick     int c = charAt(vec[k], pos);
192bb684c34Spatrick     if (c < pivot)
193bb684c34Spatrick       std::swap(vec[i++], vec[k++]);
194bb684c34Spatrick     else if (c > pivot)
195bb684c34Spatrick       std::swap(vec[--j], vec[k]);
196bb684c34Spatrick     else
197bb684c34Spatrick       k++;
198bb684c34Spatrick   }
199bb684c34Spatrick 
200bb684c34Spatrick   bool isTerminal = pivot == -1;
201bb684c34Spatrick   bool prefixesDiverge = i != 0 || j != vec.size();
202bb684c34Spatrick   if (lastPos != pos && (isTerminal || prefixesDiverge)) {
203bb684c34Spatrick     TrieNode *newNode = makeNode();
204bb684c34Spatrick     node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
205bb684c34Spatrick                              newNode);
206bb684c34Spatrick     node = newNode;
207bb684c34Spatrick     lastPos = pos;
208bb684c34Spatrick   }
209bb684c34Spatrick 
210bb684c34Spatrick   sortAndBuild(vec.slice(0, i), node, lastPos, pos);
211bb684c34Spatrick   sortAndBuild(vec.slice(j), node, lastPos, pos);
212bb684c34Spatrick 
213bb684c34Spatrick   if (isTerminal) {
214bb684c34Spatrick     assert(j - i == 1); // no duplicate symbols
2151cf9926bSpatrick     node->info = ExportInfo(*pivotSymbol, imageBase);
216bb684c34Spatrick   } else {
217bb684c34Spatrick     // This is the tail-call-optimized version of the following:
218bb684c34Spatrick     // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
219bb684c34Spatrick     vec = vec.slice(i, j - i);
220bb684c34Spatrick     ++pos;
221bb684c34Spatrick     goto tailcall;
222bb684c34Spatrick   }
223bb684c34Spatrick }
224bb684c34Spatrick 
build()225bb684c34Spatrick size_t TrieBuilder::build() {
226bb684c34Spatrick   if (exported.empty())
227bb684c34Spatrick     return 0;
228bb684c34Spatrick 
229bb684c34Spatrick   TrieNode *root = makeNode();
230bb684c34Spatrick   sortAndBuild(exported, root, 0, 0);
231bb684c34Spatrick 
232bb684c34Spatrick   // Assign each node in the vector an offset in the trie stream, iterating
233bb684c34Spatrick   // until all uleb128 sizes have stabilized.
234bb684c34Spatrick   size_t offset;
235bb684c34Spatrick   bool more;
236bb684c34Spatrick   do {
237bb684c34Spatrick     offset = 0;
238bb684c34Spatrick     more = false;
239bb684c34Spatrick     for (TrieNode *node : nodes)
240bb684c34Spatrick       more |= node->updateOffset(offset);
241bb684c34Spatrick   } while (more);
242bb684c34Spatrick 
243bb684c34Spatrick   return offset;
244bb684c34Spatrick }
245bb684c34Spatrick 
writeTo(uint8_t * buf) const246bb684c34Spatrick void TrieBuilder::writeTo(uint8_t *buf) const {
247bb684c34Spatrick   for (TrieNode *node : nodes)
248bb684c34Spatrick     node->writeTo(buf);
249bb684c34Spatrick }
250bb684c34Spatrick 
251bb684c34Spatrick namespace {
252bb684c34Spatrick 
253bb684c34Spatrick // Parse a serialized trie and invoke a callback for each entry.
254bb684c34Spatrick class TrieParser {
255bb684c34Spatrick public:
TrieParser(const uint8_t * buf,size_t size,const TrieEntryCallback & callback)256bb684c34Spatrick   TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
257bb684c34Spatrick       : start(buf), end(start + size), callback(callback) {}
258bb684c34Spatrick 
259bb684c34Spatrick   void parse(const uint8_t *buf, const Twine &cumulativeString);
260bb684c34Spatrick 
parse()261bb684c34Spatrick   void parse() { parse(start, ""); }
262bb684c34Spatrick 
263bb684c34Spatrick   const uint8_t *start;
264bb684c34Spatrick   const uint8_t *end;
265bb684c34Spatrick   const TrieEntryCallback &callback;
266bb684c34Spatrick };
267bb684c34Spatrick 
268bb684c34Spatrick } // namespace
269bb684c34Spatrick 
parse(const uint8_t * buf,const Twine & cumulativeString)270bb684c34Spatrick void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
271bb684c34Spatrick   if (buf >= end)
272bb684c34Spatrick     fatal("Node offset points outside export section");
273bb684c34Spatrick 
274bb684c34Spatrick   unsigned ulebSize;
275bb684c34Spatrick   uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
276bb684c34Spatrick   buf += ulebSize;
277bb684c34Spatrick   uint64_t flags = 0;
278bb684c34Spatrick   size_t offset;
279bb684c34Spatrick   if (terminalSize != 0) {
280bb684c34Spatrick     flags = decodeULEB128(buf, &ulebSize);
281bb684c34Spatrick     callback(cumulativeString, flags);
282bb684c34Spatrick   }
283bb684c34Spatrick   buf += terminalSize;
284bb684c34Spatrick   uint8_t numEdges = *buf++;
285bb684c34Spatrick   for (uint8_t i = 0; i < numEdges; ++i) {
286bb684c34Spatrick     const char *cbuf = reinterpret_cast<const char *>(buf);
287bb684c34Spatrick     StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
288bb684c34Spatrick     buf += substring.size() + 1;
289bb684c34Spatrick     offset = decodeULEB128(buf, &ulebSize);
290bb684c34Spatrick     buf += ulebSize;
291bb684c34Spatrick     parse(start + offset, cumulativeString + substring);
292bb684c34Spatrick   }
293bb684c34Spatrick }
294bb684c34Spatrick 
parseTrie(const uint8_t * buf,size_t size,const TrieEntryCallback & callback)295bb684c34Spatrick void macho::parseTrie(const uint8_t *buf, size_t size,
296bb684c34Spatrick                       const TrieEntryCallback &callback) {
297bb684c34Spatrick   if (size == 0)
298bb684c34Spatrick     return;
299bb684c34Spatrick 
300bb684c34Spatrick   TrieParser(buf, size, callback).parse();
301bb684c34Spatrick }
302