1//===- Object.h - Mach-O object file model ----------------------*- C++ -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8 9#ifndef LLVM_OBJCOPY_MACHO_OBJECT_H 10#define LLVM_OBJCOPY_MACHO_OBJECT_H 11 12#include "llvm/ADT/Optional.h" 13#include "llvm/ADT/StringRef.h" 14#include "llvm/BinaryFormat/MachO.h" 15#include "llvm/MC/StringTableBuilder.h" 16#include "llvm/ObjectYAML/DWARFYAML.h" 17#include "llvm/Support/StringSaver.h" 18#include "llvm/Support/YAMLTraits.h" 19#include <cstdint> 20#include <string> 21#include <vector> 22 23namespace llvm { 24namespace objcopy { 25namespace macho { 26 27struct MachHeader { 28 uint32_t Magic; 29 uint32_t CPUType; 30 uint32_t CPUSubType; 31 uint32_t FileType; 32 uint32_t NCmds; 33 uint32_t SizeOfCmds; 34 uint32_t Flags; 35 uint32_t Reserved = 0; 36}; 37 38struct RelocationInfo; 39struct Section { 40 std::string Segname; 41 std::string Sectname; 42 // CanonicalName is a string formatted as ���<Segname>,<Sectname>". 
43 std::string CanonicalName; 44 uint64_t Addr = 0; 45 uint64_t Size = 0; 46 uint32_t Offset = 0; 47 uint32_t Align = 0; 48 uint32_t RelOff = 0; 49 uint32_t NReloc = 0; 50 uint32_t Flags = 0; 51 uint32_t Reserved1 = 0; 52 uint32_t Reserved2 = 0; 53 uint32_t Reserved3 = 0; 54 StringRef Content; 55 std::vector<RelocationInfo> Relocations; 56 57 Section(StringRef SegName, StringRef SectName) 58 : Segname(SegName), Sectname(SectName), 59 CanonicalName((Twine(SegName) + Twine(',') + SectName).str()) {} 60 61 Section(StringRef SegName, StringRef SectName, StringRef Content) 62 : Segname(SegName), Sectname(SectName), 63 CanonicalName((Twine(SegName) + Twine(',') + SectName).str()), 64 Content(Content) {} 65 66 MachO::SectionType getType() const { 67 return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE); 68 } 69 70 bool isVirtualSection() const { 71 return (getType() == MachO::S_ZEROFILL || 72 getType() == MachO::S_GB_ZEROFILL || 73 getType() == MachO::S_THREAD_LOCAL_ZEROFILL); 74 } 75}; 76 77struct LoadCommand { 78 // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h 79 // and it is a union of all the structs corresponding to various load 80 // commands. 81 MachO::macho_load_command MachOLoadCommand; 82 83 // The raw content of the payload of the load command (located right after the 84 // corresponding struct). In some cases it is either empty or can be 85 // copied-over without digging into its structure. 86 std::vector<uint8_t> Payload; 87 88 // Some load commands can contain (inside the payload) an array of sections, 89 // though the contents of the sections are stored separately. The struct 90 // Section describes only sections' metadata and where to find the 91 // corresponding content inside the binary. 92 std::vector<Section> Sections; 93 94 // Returns the segment name if the load command is a segment command. 95 Optional<StringRef> getSegmentName() const; 96}; 97 98// A symbol information. 
Fields which starts with "n_" are same as them in the 99// nlist. 100struct SymbolEntry { 101 std::string Name; 102 bool Referenced = false; 103 uint32_t Index; 104 uint8_t n_type; 105 uint8_t n_sect; 106 uint16_t n_desc; 107 uint64_t n_value; 108 109 bool isExternalSymbol() const { 110 return n_type & ((MachO::N_EXT | MachO::N_PEXT)); 111 } 112 113 bool isLocalSymbol() const { return !isExternalSymbol(); } 114 115 bool isUndefinedSymbol() const { 116 return (n_type & MachO::N_TYPE) == MachO::N_UNDF; 117 } 118}; 119 120/// The location of the symbol table inside the binary is described by LC_SYMTAB 121/// load command. 122struct SymbolTable { 123 std::vector<std::unique_ptr<SymbolEntry>> Symbols; 124 125 using iterator = pointee_iterator< 126 std::vector<std::unique_ptr<SymbolEntry>>::const_iterator>; 127 128 iterator begin() const { return iterator(Symbols.begin()); } 129 iterator end() const { return iterator(Symbols.end()); } 130 131 const SymbolEntry *getSymbolByIndex(uint32_t Index) const; 132 SymbolEntry *getSymbolByIndex(uint32_t Index); 133 void removeSymbols( 134 function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove); 135}; 136 137struct IndirectSymbolEntry { 138 // The original value in an indirect symbol table. Higher bits encode extra 139 // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS). 140 uint32_t OriginalIndex; 141 /// The Symbol referenced by this entry. It's None if the index is 142 /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS. 143 Optional<SymbolEntry *> Symbol; 144 145 IndirectSymbolEntry(uint32_t OriginalIndex, Optional<SymbolEntry *> Symbol) 146 : OriginalIndex(OriginalIndex), Symbol(Symbol) {} 147}; 148 149struct IndirectSymbolTable { 150 std::vector<IndirectSymbolEntry> Symbols; 151}; 152 153/// The location of the string table inside the binary is described by LC_SYMTAB 154/// load command. 
155struct StringTable { 156 std::vector<std::string> Strings; 157}; 158 159struct RelocationInfo { 160 const SymbolEntry *Symbol; 161 // True if Info is a scattered_relocation_info. 162 bool Scattered; 163 MachO::any_relocation_info Info; 164}; 165 166/// The location of the rebase info inside the binary is described by 167/// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at 168/// an address different from its preferred address. The rebase information is 169/// a stream of byte sized opcodes whose symbolic names start with 170/// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples: 171/// <seg-index, seg-offset, type> 172/// The opcodes are a compressed way to encode the table by only 173/// encoding when a column changes. In addition simple patterns 174/// like "every n'th offset for m times" can be encoded in a few 175/// bytes. 176struct RebaseInfo { 177 // At the moment we do not parse this info (and it is simply copied over), 178 // but the proper support will be added later. 179 ArrayRef<uint8_t> Opcodes; 180}; 181 182/// The location of the bind info inside the binary is described by 183/// LC_DYLD_INFO load command. Dyld binds an image during the loading process, 184/// if the image requires any pointers to be initialized to symbols in other 185/// images. The bind information is a stream of byte sized opcodes whose 186/// symbolic names start with BIND_OPCODE_. Conceptually the bind information is 187/// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal, 188/// symbol-name, addend> The opcodes are a compressed way to encode the table by 189/// only encoding when a column changes. In addition simple patterns like for 190/// runs of pointers initialized to the same value can be encoded in a few 191/// bytes. 192struct BindInfo { 193 // At the moment we do not parse this info (and it is simply copied over), 194 // but the proper support will be added later. 
195 ArrayRef<uint8_t> Opcodes; 196}; 197 198/// The location of the weak bind info inside the binary is described by 199/// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols 200/// so that all images in the process use the same copy of some code/data. This 201/// step is done after binding. The content of the weak_bind info is an opcode 202/// stream like the bind_info. But it is sorted alphabetically by symbol name. 203/// This enable dyld to walk all images with weak binding information in order 204/// and look for collisions. If there are no collisions, dyld does no updating. 205/// That means that some fixups are also encoded in the bind_info. For 206/// instance, all calls to "operator new" are first bound to libstdc++.dylib 207/// using the information in bind_info. Then if some image overrides operator 208/// new that is detected when the weak_bind information is processed and the 209/// call to operator new is then rebound. 210struct WeakBindInfo { 211 // At the moment we do not parse this info (and it is simply copied over), 212 // but the proper support will be added later. 213 ArrayRef<uint8_t> Opcodes; 214}; 215 216/// The location of the lazy bind info inside the binary is described by 217/// LC_DYLD_INFO load command. Some uses of external symbols do not need to be 218/// bound immediately. Instead they can be lazily bound on first use. The 219/// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal 220/// use is that dyld ignores the lazy_bind section when loading an image. 221/// Instead the static linker arranged for the lazy pointer to initially point 222/// to a helper function which pushes the offset into the lazy_bind area for the 223/// symbol needing to be bound, then jumps to dyld which simply adds the offset 224/// to lazy_bind_off to get the information on what to bind. 
struct LazyBindInfo {
  // The lazy-bind opcode stream, held as a view of raw bytes.
  ArrayRef<uint8_t> Opcodes;
};

/// The location of the export info inside the binary is described by
/// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a
/// trie. This is a compact representation that factors out common prefixes. It
/// also reduces LINKEDIT pages in RAM because it encodes all information (name,
/// address, flags) in one small, contiguous range. The export area is a stream
/// of nodes. The first node sequentially is the start node for the trie. Nodes
/// for a symbol start with a uleb128 that is the length of the exported symbol
/// information for the string so far. If there is no exported symbol, the node
/// starts with a zero byte. If there is exported info, it follows the length.
/// First is a uleb128 containing flags. Normally, it is followed by
/// a uleb128 encoded offset which is the location of the content named
/// by the symbol from the mach_header for the image. If the flags
/// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is
/// a uleb128 encoded library ordinal, then a zero terminated
/// UTF8 string. If the string is zero length, then the symbol
/// is re-exported from the specified dylib with the same name.
/// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following
/// the flags are two uleb128s: the stub offset and the resolver offset.
/// The stub is used by non-lazy pointers. The resolver is used
/// by lazy pointers and must be called to get the actual address to use.
/// After the optional exported symbol information is a byte of
/// how many edges (0-255) that this node has leaving it,
/// followed by each edge.
/// Each edge is a zero terminated UTF8 string of the additional chars
/// in the symbol, followed by a uleb128 offset for the node that
/// edge points to.
255struct ExportInfo { 256 ArrayRef<uint8_t> Trie; 257}; 258 259struct LinkData { 260 ArrayRef<uint8_t> Data; 261}; 262 263struct Object { 264 MachHeader Header; 265 std::vector<LoadCommand> LoadCommands; 266 267 SymbolTable SymTable; 268 StringTable StrTable; 269 270 RebaseInfo Rebases; 271 BindInfo Binds; 272 WeakBindInfo WeakBinds; 273 LazyBindInfo LazyBinds; 274 ExportInfo Exports; 275 IndirectSymbolTable IndirectSymTable; 276 LinkData DataInCode; 277 LinkData FunctionStarts; 278 279 /// The index of LC_SYMTAB load command if present. 280 Optional<size_t> SymTabCommandIndex; 281 /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present. 282 Optional<size_t> DyLdInfoCommandIndex; 283 /// The index LC_DYSYMTAB load comamnd if present. 284 Optional<size_t> DySymTabCommandIndex; 285 /// The index LC_DATA_IN_CODE load comamnd if present. 286 Optional<size_t> DataInCodeCommandIndex; 287 /// The index LC_FUNCTION_STARTS load comamnd if present. 288 Optional<size_t> FunctionStartsCommandIndex; 289 290 BumpPtrAllocator Alloc; 291 StringSaver NewSectionsContents; 292 293 Object() : NewSectionsContents(Alloc) {} 294 295 void removeSections(function_ref<bool(const Section &)> ToRemove); 296 void addLoadCommand(LoadCommand LC); 297 298 /// Creates a new segment load command in the object and returns a reference 299 /// to the newly created load command. The caller should verify that SegName 300 /// is not too long (SegName.size() should be less than or equal to 16). 301 LoadCommand &addSegment(StringRef SegName); 302 303 bool is64Bit() const { 304 return Header.Magic == MachO::MH_MAGIC_64 || 305 Header.Magic == MachO::MH_CIGAM_64; 306 } 307}; 308 309} // end namespace macho 310} // end namespace objcopy 311} // end namespace llvm 312 313#endif // LLVM_OBJCOPY_MACHO_OBJECT_H 314