1//===- Object.h - Mach-O object file model ----------------------*- C++ -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8 9#ifndef LLVM_OBJCOPY_MACHO_OBJECT_H 10#define LLVM_OBJCOPY_MACHO_OBJECT_H 11 12#include "llvm/ADT/Optional.h" 13#include "llvm/ADT/StringRef.h" 14#include "llvm/BinaryFormat/MachO.h" 15#include "llvm/MC/StringTableBuilder.h" 16#include "llvm/ObjectYAML/DWARFYAML.h" 17#include "llvm/Support/StringSaver.h" 18#include "llvm/Support/YAMLTraits.h" 19#include <cstdint> 20#include <string> 21#include <vector> 22 23namespace llvm { 24namespace objcopy { 25namespace macho { 26 27struct MachHeader { 28 uint32_t Magic; 29 uint32_t CPUType; 30 uint32_t CPUSubType; 31 uint32_t FileType; 32 uint32_t NCmds; 33 uint32_t SizeOfCmds; 34 uint32_t Flags; 35 uint32_t Reserved = 0; 36}; 37 38struct RelocationInfo; 39struct Section { 40 uint32_t Index; 41 std::string Segname; 42 std::string Sectname; 43 // CanonicalName is a string formatted as ���<Segname>,<Sectname>". 44 std::string CanonicalName; 45 uint64_t Addr = 0; 46 uint64_t Size = 0; 47 uint32_t Offset = 0; 48 uint32_t Align = 0; 49 uint32_t RelOff = 0; 50 uint32_t NReloc = 0; 51 uint32_t Flags = 0; 52 uint32_t Reserved1 = 0; 53 uint32_t Reserved2 = 0; 54 uint32_t Reserved3 = 0; 55 StringRef Content; 56 std::vector<RelocationInfo> Relocations; 57 58 Section(StringRef SegName, StringRef SectName) 59 : Segname(std::string(SegName)), Sectname(std::string(SectName)), 60 CanonicalName((Twine(SegName) + Twine(',') + SectName).str()) {} 61 62 Section(StringRef SegName, StringRef SectName, StringRef Content) 63 : Segname(std::string(SegName)), Sectname(std::string(SectName)), 64 CanonicalName((Twine(SegName) + Twine(',') + SectName).str()), 65 Content(Content) {} 66 67 MachO::SectionType getType() const { 68 return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE); 69 } 70 71 bool isVirtualSection() const { 72 return (getType() == MachO::S_ZEROFILL || 73 getType() == MachO::S_GB_ZEROFILL || 74 getType() == MachO::S_THREAD_LOCAL_ZEROFILL); 75 } 76}; 77 78struct LoadCommand { 79 // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h 80 // and it is a union of all the structs corresponding to various load 81 // commands. 82 MachO::macho_load_command MachOLoadCommand; 83 84 // The raw content of the payload of the load command (located right after the 85 // corresponding struct). In some cases it is either empty or can be 86 // copied-over without digging into its structure. 87 std::vector<uint8_t> Payload; 88 89 // Some load commands can contain (inside the payload) an array of sections, 90 // though the contents of the sections are stored separately. The struct 91 // Section describes only sections' metadata and where to find the 92 // corresponding content inside the binary. 93 std::vector<std::unique_ptr<Section>> Sections; 94 95 // Returns the segment name if the load command is a segment command. 96 Optional<StringRef> getSegmentName() const; 97}; 98 99// A symbol information. Fields which starts with "n_" are same as them in the 100// nlist. 101struct SymbolEntry { 102 std::string Name; 103 bool Referenced = false; 104 uint32_t Index; 105 uint8_t n_type; 106 uint8_t n_sect; 107 uint16_t n_desc; 108 uint64_t n_value; 109 110 bool isExternalSymbol() const { return n_type & MachO::N_EXT; } 111 112 bool isLocalSymbol() const { return !isExternalSymbol(); } 113 114 bool isUndefinedSymbol() const { 115 return (n_type & MachO::N_TYPE) == MachO::N_UNDF; 116 } 117 118 bool isSwiftSymbol() const { 119 return StringRef(Name).startswith("_$s") || 120 StringRef(Name).startswith("_$S"); 121 } 122 123 Optional<uint32_t> section() const { 124 return n_sect == MachO::NO_SECT ? None : Optional<uint32_t>(n_sect); 125 } 126}; 127 128/// The location of the symbol table inside the binary is described by LC_SYMTAB 129/// load command. 130struct SymbolTable { 131 std::vector<std::unique_ptr<SymbolEntry>> Symbols; 132 133 using iterator = pointee_iterator< 134 std::vector<std::unique_ptr<SymbolEntry>>::const_iterator>; 135 136 iterator begin() const { return iterator(Symbols.begin()); } 137 iterator end() const { return iterator(Symbols.end()); } 138 139 const SymbolEntry *getSymbolByIndex(uint32_t Index) const; 140 SymbolEntry *getSymbolByIndex(uint32_t Index); 141 void removeSymbols( 142 function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove); 143}; 144 145struct IndirectSymbolEntry { 146 // The original value in an indirect symbol table. Higher bits encode extra 147 // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS). 148 uint32_t OriginalIndex; 149 /// The Symbol referenced by this entry. It's None if the index is 150 /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS. 151 Optional<SymbolEntry *> Symbol; 152 153 IndirectSymbolEntry(uint32_t OriginalIndex, Optional<SymbolEntry *> Symbol) 154 : OriginalIndex(OriginalIndex), Symbol(Symbol) {} 155}; 156 157struct IndirectSymbolTable { 158 std::vector<IndirectSymbolEntry> Symbols; 159}; 160 161/// The location of the string table inside the binary is described by LC_SYMTAB 162/// load command. 163struct StringTable { 164 std::vector<std::string> Strings; 165}; 166 167struct RelocationInfo { 168 // The referenced symbol entry. Set if !Scattered && Extern. 169 Optional<const SymbolEntry *> Symbol; 170 // The referenced section. Set if !Scattered && !Extern. 171 Optional<const Section *> Sec; 172 // True if Info is a scattered_relocation_info. 173 bool Scattered; 174 // True if the r_symbolnum points to a section number (i.e. r_extern=0). 175 bool Extern; 176 MachO::any_relocation_info Info; 177 178 unsigned getPlainRelocationSymbolNum(bool IsLittleEndian) { 179 if (IsLittleEndian) 180 return Info.r_word1 & 0xffffff; 181 return Info.r_word1 >> 8; 182 } 183 184 void setPlainRelocationSymbolNum(unsigned SymbolNum, bool IsLittleEndian) { 185 assert(SymbolNum < (1 << 24) && "SymbolNum out of range"); 186 if (IsLittleEndian) 187 Info.r_word1 = (Info.r_word1 & ~0x00ffffff) | SymbolNum; 188 else 189 Info.r_word1 = (Info.r_word1 & ~0xffffff00) | (SymbolNum << 8); 190 } 191}; 192 193/// The location of the rebase info inside the binary is described by 194/// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at 195/// an address different from its preferred address. The rebase information is 196/// a stream of byte sized opcodes whose symbolic names start with 197/// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples: 198/// <seg-index, seg-offset, type> 199/// The opcodes are a compressed way to encode the table by only 200/// encoding when a column changes. In addition simple patterns 201/// like "every n'th offset for m times" can be encoded in a few 202/// bytes. 203struct RebaseInfo { 204 // At the moment we do not parse this info (and it is simply copied over), 205 // but the proper support will be added later. 206 ArrayRef<uint8_t> Opcodes; 207}; 208 209/// The location of the bind info inside the binary is described by 210/// LC_DYLD_INFO load command. Dyld binds an image during the loading process, 211/// if the image requires any pointers to be initialized to symbols in other 212/// images. The bind information is a stream of byte sized opcodes whose 213/// symbolic names start with BIND_OPCODE_. Conceptually the bind information is 214/// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal, 215/// symbol-name, addend> The opcodes are a compressed way to encode the table by 216/// only encoding when a column changes. In addition simple patterns like for 217/// runs of pointers initialized to the same value can be encoded in a few 218/// bytes. 219struct BindInfo { 220 // At the moment we do not parse this info (and it is simply copied over), 221 // but the proper support will be added later. 222 ArrayRef<uint8_t> Opcodes; 223}; 224 225/// The location of the weak bind info inside the binary is described by 226/// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols 227/// so that all images in the process use the same copy of some code/data. This 228/// step is done after binding. The content of the weak_bind info is an opcode 229/// stream like the bind_info. But it is sorted alphabetically by symbol name. 230/// This enable dyld to walk all images with weak binding information in order 231/// and look for collisions. If there are no collisions, dyld does no updating. 232/// That means that some fixups are also encoded in the bind_info. For 233/// instance, all calls to "operator new" are first bound to libstdc++.dylib 234/// using the information in bind_info. Then if some image overrides operator 235/// new that is detected when the weak_bind information is processed and the 236/// call to operator new is then rebound. 237struct WeakBindInfo { 238 // At the moment we do not parse this info (and it is simply copied over), 239 // but the proper support will be added later. 240 ArrayRef<uint8_t> Opcodes; 241}; 242 243/// The location of the lazy bind info inside the binary is described by 244/// LC_DYLD_INFO load command. Some uses of external symbols do not need to be 245/// bound immediately. Instead they can be lazily bound on first use. The 246/// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal 247/// use is that dyld ignores the lazy_bind section when loading an image. 248/// Instead the static linker arranged for the lazy pointer to initially point 249/// to a helper function which pushes the offset into the lazy_bind area for the 250/// symbol needing to be bound, then jumps to dyld which simply adds the offset 251/// to lazy_bind_off to get the information on what to bind. 252struct LazyBindInfo { 253 ArrayRef<uint8_t> Opcodes; 254}; 255 256/// The location of the export info inside the binary is described by 257/// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a 258/// trie. This is a compact representation that factors out common prefixes. It 259/// also reduces LINKEDIT pages in RAM because it encodes all information (name, 260/// address, flags) in one small, contiguous range. The export area is a stream 261/// of nodes. The first node sequentially is the start node for the trie. Nodes 262/// for a symbol start with a uleb128 that is the length of the exported symbol 263/// information for the string so far. If there is no exported symbol, the node 264/// starts with a zero byte. If there is exported info, it follows the length. 265/// First is a uleb128 containing flags. Normally, it is followed by 266/// a uleb128 encoded offset which is location of the content named 267/// by the symbol from the mach_header for the image. If the flags 268/// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is 269/// a uleb128 encoded library ordinal, then a zero terminated 270/// UTF8 string. If the string is zero length, then the symbol 271/// is re-export from the specified dylib with the same name. 272/// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following 273/// the flags is two uleb128s: the stub offset and the resolver offset. 274/// The stub is used by non-lazy pointers. The resolver is used 275/// by lazy pointers and must be called to get the actual address to use. 276/// After the optional exported symbol information is a byte of 277/// how many edges (0-255) that this node has leaving it, 278/// followed by each edge. 279/// Each edge is a zero terminated UTF8 of the addition chars 280/// in the symbol, followed by a uleb128 offset for the node that 281/// edge points to. 282struct ExportInfo { 283 ArrayRef<uint8_t> Trie; 284}; 285 286struct LinkData { 287 ArrayRef<uint8_t> Data; 288}; 289 290struct Object { 291 MachHeader Header; 292 std::vector<LoadCommand> LoadCommands; 293 294 SymbolTable SymTable; 295 StringTable StrTable; 296 297 RebaseInfo Rebases; 298 BindInfo Binds; 299 WeakBindInfo WeakBinds; 300 LazyBindInfo LazyBinds; 301 ExportInfo Exports; 302 IndirectSymbolTable IndirectSymTable; 303 LinkData DataInCode; 304 LinkData FunctionStarts; 305 LinkData CodeSignature; 306 307 Optional<uint32_t> SwiftVersion; 308 309 /// The index of LC_CODE_SIGNATURE load command if present. 310 Optional<size_t> CodeSignatureCommandIndex; 311 /// The index of LC_SYMTAB load command if present. 312 Optional<size_t> SymTabCommandIndex; 313 /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present. 314 Optional<size_t> DyLdInfoCommandIndex; 315 /// The index LC_DYSYMTAB load comamnd if present. 316 Optional<size_t> DySymTabCommandIndex; 317 /// The index LC_DATA_IN_CODE load comamnd if present. 318 Optional<size_t> DataInCodeCommandIndex; 319 /// The index LC_FUNCTION_STARTS load comamnd if present. 320 Optional<size_t> FunctionStartsCommandIndex; 321 322 BumpPtrAllocator Alloc; 323 StringSaver NewSectionsContents; 324 325 Object() : NewSectionsContents(Alloc) {} 326 327 Error 328 removeSections(function_ref<bool(const std::unique_ptr<Section> &)> ToRemove); 329 330 Error removeLoadCommands(function_ref<bool(const LoadCommand &)> ToRemove); 331 332 void updateLoadCommandIndexes(); 333 334 void addLoadCommand(LoadCommand LC); 335 336 /// Creates a new segment load command in the object and returns a reference 337 /// to the newly created load command. The caller should verify that SegName 338 /// is not too long (SegName.size() should be less than or equal to 16). 339 LoadCommand &addSegment(StringRef SegName); 340 341 bool is64Bit() const { 342 return Header.Magic == MachO::MH_MAGIC_64 || 343 Header.Magic == MachO::MH_CIGAM_64; 344 } 345}; 346 347} // end namespace macho 348} // end namespace objcopy 349} // end namespace llvm 350 351#endif // LLVM_OBJCOPY_MACHO_OBJECT_H 352