1//===- ExportTrie.cpp -----------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is a partial implementation of the Mach-O export trie format. It's
10// essentially a symbol table encoded as a compressed prefix trie, meaning that
11// the common prefixes of each symbol name are shared for a more compact
12// representation. The prefixes are stored on the edges of the trie, and one
13// edge can represent multiple characters. For example, given two exported
14// symbols _bar and _baz, we will have a trie like this (terminal nodes are
15// marked with an asterisk):
16//
17//              +-+-+
18//              |   | // root node
19//              +-+-+
20//                |
21//                | _ba
22//                |
23//              +-+-+
24//              |   |
25//              +-+-+
26//           r /     \ z
27//            /       \
28//        +-+-+       +-+-+
29//        | * |       | * |
30//        +-+-+       +-+-+
31//
32// More documentation of the format can be found in
33// llvm/tools/obj2yaml/macho2yaml.cpp.
34//
35//===----------------------------------------------------------------------===//
36
37#include "ExportTrie.h"
38#include "Symbols.h"
39
40#include "lld/Common/ErrorHandler.h"
41#include "lld/Common/Memory.h"
42#include "llvm/ADT/Optional.h"
43#include "llvm/BinaryFormat/MachO.h"
44#include "llvm/Support/LEB128.h"
45
46using namespace llvm;
47using namespace llvm::MachO;
48using namespace lld;
49using namespace lld::macho;
50
51namespace {
52
53struct Edge {
54  Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
55
56  StringRef substring;
57  struct TrieNode *child;
58};
59
60struct ExportInfo {
61  uint64_t address;
62  // TODO: Add proper support for re-exports & stub-and-resolver flags.
63};
64
65} // namespace
66
67struct macho::TrieNode {
68  std::vector<Edge> edges;
69  Optional<ExportInfo> info;
70  // Estimated offset from the start of the serialized trie to the current node.
71  // This will converge to the true offset when updateOffset() is run to a
72  // fixpoint.
73  size_t offset = 0;
74
75  // Returns whether the new estimated offset differs from the old one.
76  bool updateOffset(size_t &nextOffset);
77  void writeTo(uint8_t *buf) const;
78};
79
80bool TrieNode::updateOffset(size_t &nextOffset) {
81  // Size of the whole node (including the terminalSize and the outgoing edges.)
82  // In contrast, terminalSize only records the size of the other data in the
83  // node.
84  size_t nodeSize;
85  if (info) {
86    uint64_t flags = 0;
87    uint32_t terminalSize =
88        getULEB128Size(flags) + getULEB128Size(info->address);
89    // Overall node size so far is the uleb128 size of the length of the symbol
90    // info + the symbol info itself.
91    nodeSize = terminalSize + getULEB128Size(terminalSize);
92  } else {
93    nodeSize = 1; // Size of terminalSize (which has a value of 0)
94  }
95  // Compute size of all child edges.
96  ++nodeSize; // Byte for number of children.
97  for (Edge &edge : edges) {
98    nodeSize += edge.substring.size() + 1             // String length.
99                + getULEB128Size(edge.child->offset); // Offset len.
100  }
101  // On input, 'nextOffset' is the new preferred location for this node.
102  bool result = (offset != nextOffset);
103  // Store new location in node object for use by parents.
104  offset = nextOffset;
105  nextOffset += nodeSize;
106  return result;
107}
108
109void TrieNode::writeTo(uint8_t *buf) const {
110  buf += offset;
111  if (info) {
112    // TrieNodes with Symbol info: size, flags address
113    uint64_t flags = 0; // TODO: emit proper flags
114    uint32_t terminalSize =
115        getULEB128Size(flags) + getULEB128Size(info->address);
116    buf += encodeULEB128(terminalSize, buf);
117    buf += encodeULEB128(flags, buf);
118    buf += encodeULEB128(info->address, buf);
119  } else {
120    // TrieNode with no Symbol info.
121    *buf++ = 0; // terminalSize
122  }
123  // Add number of children. TODO: Handle case where we have more than 256.
124  assert(edges.size() < 256);
125  *buf++ = edges.size();
126  // Append each child edge substring and node offset.
127  for (const Edge &edge : edges) {
128    memcpy(buf, edge.substring.data(), edge.substring.size());
129    buf += edge.substring.size();
130    *buf++ = '\0';
131    buf += encodeULEB128(edge.child->offset, buf);
132  }
133}
134
135TrieNode *TrieBuilder::makeNode() {
136  auto *node = make<TrieNode>();
137  nodes.emplace_back(node);
138  return node;
139}
140
141static int charAt(const Symbol *sym, size_t pos) {
142  StringRef str = sym->getName();
143  if (pos >= str.size())
144    return -1;
145  return str[pos];
146}
147
148// Build the trie by performing a three-way radix quicksort: We start by sorting
149// the strings by their first characters, then sort the strings with the same
150// first characters by their second characters, and so on recursively. Each
151// time the prefixes diverge, we add a node to the trie.
152//
153// node:    The most recently created node along this path in the trie (i.e.
154//          the furthest from the root.)
155// lastPos: The prefix length of the most recently created node, i.e. the number
156//          of characters along its path from the root.
157// pos:     The string index we are currently sorting on. Note that each symbol
158//          S contained in vec has the same prefix S[0...pos).
159void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
160                               TrieNode *node, size_t lastPos, size_t pos) {
161tailcall:
162  if (vec.empty())
163    return;
164
165  // Partition items so that items in [0, i) are less than the pivot,
166  // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
167  // the pivot.
168  const Symbol *pivotSymbol = vec[vec.size() / 2];
169  int pivot = charAt(pivotSymbol, pos);
170  size_t i = 0;
171  size_t j = vec.size();
172  for (size_t k = 0; k < j;) {
173    int c = charAt(vec[k], pos);
174    if (c < pivot)
175      std::swap(vec[i++], vec[k++]);
176    else if (c > pivot)
177      std::swap(vec[--j], vec[k]);
178    else
179      k++;
180  }
181
182  bool isTerminal = pivot == -1;
183  bool prefixesDiverge = i != 0 || j != vec.size();
184  if (lastPos != pos && (isTerminal || prefixesDiverge)) {
185    TrieNode *newNode = makeNode();
186    node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
187                             newNode);
188    node = newNode;
189    lastPos = pos;
190  }
191
192  sortAndBuild(vec.slice(0, i), node, lastPos, pos);
193  sortAndBuild(vec.slice(j), node, lastPos, pos);
194
195  if (isTerminal) {
196    assert(j - i == 1); // no duplicate symbols
197    node->info = {pivotSymbol->getVA()};
198  } else {
199    // This is the tail-call-optimized version of the following:
200    // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
201    vec = vec.slice(i, j - i);
202    ++pos;
203    goto tailcall;
204  }
205}
206
207size_t TrieBuilder::build() {
208  if (exported.empty())
209    return 0;
210
211  TrieNode *root = makeNode();
212  sortAndBuild(exported, root, 0, 0);
213
214  // Assign each node in the vector an offset in the trie stream, iterating
215  // until all uleb128 sizes have stabilized.
216  size_t offset;
217  bool more;
218  do {
219    offset = 0;
220    more = false;
221    for (TrieNode *node : nodes)
222      more |= node->updateOffset(offset);
223  } while (more);
224
225  return offset;
226}
227
228void TrieBuilder::writeTo(uint8_t *buf) const {
229  for (TrieNode *node : nodes)
230    node->writeTo(buf);
231}
232
233namespace {
234
235// Parse a serialized trie and invoke a callback for each entry.
236class TrieParser {
237public:
238  TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback)
239      : start(buf), end(start + size), callback(callback) {}
240
241  void parse(const uint8_t *buf, const Twine &cumulativeString);
242
243  void parse() { parse(start, ""); }
244
245  const uint8_t *start;
246  const uint8_t *end;
247  const TrieEntryCallback &callback;
248};
249
250} // namespace
251
252void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) {
253  if (buf >= end)
254    fatal("Node offset points outside export section");
255
256  unsigned ulebSize;
257  uint64_t terminalSize = decodeULEB128(buf, &ulebSize);
258  buf += ulebSize;
259  uint64_t flags = 0;
260  size_t offset;
261  if (terminalSize != 0) {
262    flags = decodeULEB128(buf, &ulebSize);
263    callback(cumulativeString, flags);
264  }
265  buf += terminalSize;
266  uint8_t numEdges = *buf++;
267  for (uint8_t i = 0; i < numEdges; ++i) {
268    const char *cbuf = reinterpret_cast<const char *>(buf);
269    StringRef substring = StringRef(cbuf, strnlen(cbuf, end - buf));
270    buf += substring.size() + 1;
271    offset = decodeULEB128(buf, &ulebSize);
272    buf += ulebSize;
273    parse(start + offset, cumulativeString + substring);
274  }
275}
276
277void macho::parseTrie(const uint8_t *buf, size_t size,
278                      const TrieEntryCallback &callback) {
279  if (size == 0)
280    return;
281
282  TrieParser(buf, size, callback).parse();
283}
284