1//===- Object.h - Mach-O object file model ----------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_OBJCOPY_MACHO_OBJECT_H
10#define LLVM_OBJCOPY_MACHO_OBJECT_H
11
12#include "llvm/ADT/Optional.h"
13#include "llvm/ADT/StringRef.h"
14#include "llvm/BinaryFormat/MachO.h"
15#include "llvm/MC/StringTableBuilder.h"
16#include "llvm/ObjectYAML/DWARFYAML.h"
17#include "llvm/Support/StringSaver.h"
18#include "llvm/Support/YAMLTraits.h"
19#include <cstdint>
20#include <string>
21#include <vector>
22
23namespace llvm {
24namespace objcopy {
25namespace macho {
26
/// In-memory copy of the fields of a Mach-O file header.
///
/// Every field defaults to zero so that a default-constructed header (for
/// example the one embedded in a freshly created Object) never exposes
/// indeterminate values before it has been populated.
struct MachHeader {
  uint32_t Magic = 0;
  uint32_t CPUType = 0;
  uint32_t CPUSubType = 0;
  uint32_t FileType = 0;
  uint32_t NCmds = 0;
  uint32_t SizeOfCmds = 0;
  uint32_t Flags = 0;
  // NOTE(review): presumably only meaningful for 64-bit headers — confirm
  // against the reader/writer.
  uint32_t Reserved = 0;
};
37
38struct RelocationInfo;
39struct Section {
40  uint32_t Index;
41  std::string Segname;
42  std::string Sectname;
43  // CanonicalName is a string formatted as ���<Segname>,<Sectname>".
44  std::string CanonicalName;
45  uint64_t Addr = 0;
46  uint64_t Size = 0;
47  uint32_t Offset = 0;
48  uint32_t Align = 0;
49  uint32_t RelOff = 0;
50  uint32_t NReloc = 0;
51  uint32_t Flags = 0;
52  uint32_t Reserved1 = 0;
53  uint32_t Reserved2 = 0;
54  uint32_t Reserved3 = 0;
55  StringRef Content;
56  std::vector<RelocationInfo> Relocations;
57
58  Section(StringRef SegName, StringRef SectName)
59      : Segname(std::string(SegName)), Sectname(std::string(SectName)),
60        CanonicalName((Twine(SegName) + Twine(',') + SectName).str()) {}
61
62  Section(StringRef SegName, StringRef SectName, StringRef Content)
63      : Segname(std::string(SegName)), Sectname(std::string(SectName)),
64        CanonicalName((Twine(SegName) + Twine(',') + SectName).str()),
65        Content(Content) {}
66
67  MachO::SectionType getType() const {
68    return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE);
69  }
70
71  bool isVirtualSection() const {
72    return (getType() == MachO::S_ZEROFILL ||
73            getType() == MachO::S_GB_ZEROFILL ||
74            getType() == MachO::S_THREAD_LOCAL_ZEROFILL);
75  }
76};
77
/// A single load command: the command struct itself, the raw payload that
/// follows it, and the metadata of any sections it contains.
struct LoadCommand {
  // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h
  // and it is a union of all the structs corresponding to various load
  // commands.
  MachO::macho_load_command MachOLoadCommand;

  // The raw content of the payload of the load command (located right after the
  // corresponding struct). In some cases it is either empty or can be
  // copied-over without digging into its structure.
  std::vector<uint8_t> Payload;

  // Some load commands can contain (inside the payload) an array of sections,
  // though the contents of the sections are stored separately. The struct
  // Section describes only sections' metadata and where to find the
  // corresponding content inside the binary.
  std::vector<std::unique_ptr<Section>> Sections;

  // Returns the segment name if the load command is a segment command.
  // NOTE(review): presumably returns None for every other kind of command; the
  // definition is not part of this header — confirm.
  Optional<StringRef> getSegmentName() const;
};
98
99// A symbol information. Fields which starts with "n_" are same as them in the
100// nlist.
101struct SymbolEntry {
102  std::string Name;
103  bool Referenced = false;
104  uint32_t Index;
105  uint8_t n_type;
106  uint8_t n_sect;
107  uint16_t n_desc;
108  uint64_t n_value;
109
110  bool isExternalSymbol() const { return n_type & MachO::N_EXT; }
111
112  bool isLocalSymbol() const { return !isExternalSymbol(); }
113
114  bool isUndefinedSymbol() const {
115    return (n_type & MachO::N_TYPE) == MachO::N_UNDF;
116  }
117
118  bool isSwiftSymbol() const {
119    return StringRef(Name).startswith("_$s") ||
120           StringRef(Name).startswith("_$S");
121  }
122
123  Optional<uint32_t> section() const {
124    return n_sect == MachO::NO_SECT ? None : Optional<uint32_t>(n_sect);
125  }
126};
127
128/// The location of the symbol table inside the binary is described by LC_SYMTAB
129/// load command.
130struct SymbolTable {
131  std::vector<std::unique_ptr<SymbolEntry>> Symbols;
132
133  using iterator = pointee_iterator<
134      std::vector<std::unique_ptr<SymbolEntry>>::const_iterator>;
135
136  iterator begin() const { return iterator(Symbols.begin()); }
137  iterator end() const { return iterator(Symbols.end()); }
138
139  const SymbolEntry *getSymbolByIndex(uint32_t Index) const;
140  SymbolEntry *getSymbolByIndex(uint32_t Index);
141  void removeSymbols(
142      function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove);
143};
144
/// One entry of the indirect symbol table.
struct IndirectSymbolEntry {
  // The original value in an indirect symbol table. Higher bits encode extra
  // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS).
  uint32_t OriginalIndex;
  /// The Symbol referenced by this entry. It's None if the index is
  /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS.
  /// The pointer is stored as-is and does not own the symbol.
  Optional<SymbolEntry *> Symbol;

  /// Stores \p OriginalIndex and \p Symbol unchanged.
  IndirectSymbolEntry(uint32_t OriginalIndex, Optional<SymbolEntry *> Symbol)
      : OriginalIndex(OriginalIndex), Symbol(Symbol) {}
};
156
/// The list of indirect symbol table entries of the object.
struct IndirectSymbolTable {
  std::vector<IndirectSymbolEntry> Symbols;
};
160
/// The location of the string table inside the binary is described by LC_SYMTAB
/// load command.
struct StringTable {
  // The strings of the table, each stored as an owned copy.
  std::vector<std::string> Strings;
};
166
167struct RelocationInfo {
168  // The referenced symbol entry. Set if !Scattered && Extern.
169  Optional<const SymbolEntry *> Symbol;
170  // The referenced section. Set if !Scattered && !Extern.
171  Optional<const Section *> Sec;
172  // True if Info is a scattered_relocation_info.
173  bool Scattered;
174  // True if the r_symbolnum points to a section number (i.e. r_extern=0).
175  bool Extern;
176  MachO::any_relocation_info Info;
177
178  unsigned getPlainRelocationSymbolNum(bool IsLittleEndian) {
179    if (IsLittleEndian)
180      return Info.r_word1 & 0xffffff;
181    return Info.r_word1 >> 8;
182  }
183
184  void setPlainRelocationSymbolNum(unsigned SymbolNum, bool IsLittleEndian) {
185    assert(SymbolNum < (1 << 24) && "SymbolNum out of range");
186    if (IsLittleEndian)
187      Info.r_word1 = (Info.r_word1 & ~0x00ffffff) | SymbolNum;
188    else
189      Info.r_word1 = (Info.r_word1 & ~0xffffff00) | (SymbolNum << 8);
190  }
191};
192
193/// The location of the rebase info inside the binary is described by
194/// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at
195/// an address different from its preferred address.  The rebase information is
196/// a stream of byte sized opcodes whose symbolic names start with
197/// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples:
198///   <seg-index, seg-offset, type>
199/// The opcodes are a compressed way to encode the table by only
200/// encoding when a column changes.  In addition simple patterns
201/// like "every n'th offset for m times" can be encoded in a few
202/// bytes.
struct RebaseInfo {
  // At the moment we do not parse this info (and it is simply copied over),
  // but the proper support will be added later.
  // ArrayRef does not own the bytes; the underlying buffer has to outlive this
  // object.
  ArrayRef<uint8_t> Opcodes;
};
208
209/// The location of the bind info inside the binary is described by
210/// LC_DYLD_INFO load command. Dyld binds an image during the loading process,
211/// if the image requires any pointers to be initialized to symbols in other
212/// images. The bind information is a stream of byte sized opcodes whose
213/// symbolic names start with BIND_OPCODE_. Conceptually the bind information is
214/// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal,
215/// symbol-name, addend> The opcodes are a compressed way to encode the table by
216/// only encoding when a column changes.  In addition simple patterns like for
217/// runs of pointers initialized to the same value can be encoded in a few
218/// bytes.
struct BindInfo {
  // At the moment we do not parse this info (and it is simply copied over),
  // but the proper support will be added later.
  // ArrayRef does not own the bytes; the underlying buffer has to outlive this
  // object.
  ArrayRef<uint8_t> Opcodes;
};
224
225/// The location of the weak bind info inside the binary is described by
226/// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols
227/// so that all images in the process use the same copy of some code/data. This
228/// step is done after binding. The content of the weak_bind info is an opcode
229/// stream like the bind_info.  But it is sorted alphabetically by symbol name.
/// This enables dyld to walk all images with weak binding information in order
231/// and look for collisions.  If there are no collisions, dyld does no updating.
232/// That means that some fixups are also encoded in the bind_info.  For
233/// instance, all calls to "operator new" are first bound to libstdc++.dylib
234/// using the information in bind_info.  Then if some image overrides operator
235/// new that is detected when the weak_bind information is processed and the
236/// call to operator new is then rebound.
struct WeakBindInfo {
  // At the moment we do not parse this info (and it is simply copied over),
  // but the proper support will be added later.
  // ArrayRef does not own the bytes; the underlying buffer has to outlive this
  // object.
  ArrayRef<uint8_t> Opcodes;
};
242
243/// The location of the lazy bind info inside the binary is described by
244/// LC_DYLD_INFO load command. Some uses of external symbols do not need to be
245/// bound immediately. Instead they can be lazily bound on first use.  The
246/// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal
247/// use is that dyld ignores the lazy_bind section when loading an image.
248/// Instead the static linker arranged for the lazy pointer to initially point
249/// to a helper function which pushes the offset into the lazy_bind area for the
250/// symbol needing to be bound, then jumps to dyld which simply adds the offset
251/// to lazy_bind_off to get the information on what to bind.
struct LazyBindInfo {
  // Non-owning view of the lazy-bind opcode bytes; the underlying buffer has to
  // outlive this object.
  ArrayRef<uint8_t> Opcodes;
};
255
256/// The location of the export info inside the binary is described by
257/// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a
258/// trie.  This is a compact representation that factors out common prefixes. It
259/// also reduces LINKEDIT pages in RAM because it encodes all information (name,
260/// address, flags) in one small, contiguous range. The export area is a stream
261/// of nodes.  The first node sequentially is the start node for the trie. Nodes
262/// for a symbol start with a uleb128 that is the length of the exported symbol
263/// information for the string so far. If there is no exported symbol, the node
264/// starts with a zero byte. If there is exported info, it follows the length.
265/// First is a uleb128 containing flags. Normally, it is followed by
266/// a uleb128 encoded offset which is location of the content named
267/// by the symbol from the mach_header for the image.  If the flags
268/// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is
269/// a uleb128 encoded library ordinal, then a zero terminated
270/// UTF8 string.  If the string is zero length, then the symbol
/// is re-exported from the specified dylib with the same name.
272/// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following
273/// the flags is two uleb128s: the stub offset and the resolver offset.
274/// The stub is used by non-lazy pointers.  The resolver is used
275/// by lazy pointers and must be called to get the actual address to use.
276/// After the optional exported symbol information is a byte of
277/// how many edges (0-255) that this node has leaving it,
278/// followed by each edge.
/// Each edge is a zero-terminated UTF8 string of the additional chars
280/// in the symbol, followed by a uleb128 offset for the node that
281/// edge points to.
struct ExportInfo {
  // Non-owning view of the encoded export trie bytes; the underlying buffer has
  // to outlive this object.
  ArrayRef<uint8_t> Trie;
};
285
/// A raw blob of bytes. Object uses it for its DataInCode, FunctionStarts and
/// CodeSignature members.
struct LinkData {
  // Non-owning view of the bytes; the underlying buffer has to outlive this
  // object.
  ArrayRef<uint8_t> Data;
};
289
290struct Object {
291  MachHeader Header;
292  std::vector<LoadCommand> LoadCommands;
293
294  SymbolTable SymTable;
295  StringTable StrTable;
296
297  RebaseInfo Rebases;
298  BindInfo Binds;
299  WeakBindInfo WeakBinds;
300  LazyBindInfo LazyBinds;
301  ExportInfo Exports;
302  IndirectSymbolTable IndirectSymTable;
303  LinkData DataInCode;
304  LinkData FunctionStarts;
305  LinkData CodeSignature;
306
307  Optional<uint32_t> SwiftVersion;
308
309  /// The index of LC_CODE_SIGNATURE load command if present.
310  Optional<size_t> CodeSignatureCommandIndex;
311  /// The index of LC_SYMTAB load command if present.
312  Optional<size_t> SymTabCommandIndex;
313  /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present.
314  Optional<size_t> DyLdInfoCommandIndex;
315  /// The index LC_DYSYMTAB load comamnd if present.
316  Optional<size_t> DySymTabCommandIndex;
317  /// The index LC_DATA_IN_CODE load comamnd if present.
318  Optional<size_t> DataInCodeCommandIndex;
319  /// The index LC_FUNCTION_STARTS load comamnd if present.
320  Optional<size_t> FunctionStartsCommandIndex;
321
322  BumpPtrAllocator Alloc;
323  StringSaver NewSectionsContents;
324
325  Object() : NewSectionsContents(Alloc) {}
326
327  Error
328  removeSections(function_ref<bool(const std::unique_ptr<Section> &)> ToRemove);
329
330  Error removeLoadCommands(function_ref<bool(const LoadCommand &)> ToRemove);
331
332  void updateLoadCommandIndexes();
333
334  void addLoadCommand(LoadCommand LC);
335
336  /// Creates a new segment load command in the object and returns a reference
337  /// to the newly created load command. The caller should verify that SegName
338  /// is not too long (SegName.size() should be less than or equal to 16).
339  LoadCommand &addSegment(StringRef SegName);
340
341  bool is64Bit() const {
342    return Header.Magic == MachO::MH_MAGIC_64 ||
343           Header.Magic == MachO::MH_CIGAM_64;
344  }
345};
346
347} // end namespace macho
348} // end namespace objcopy
349} // end namespace llvm
350
351#endif // LLVM_OBJCOPY_MACHO_OBJECT_H
352