1//===-- ArchiveReader.cpp - Read LLVM archive files -------------*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
// Reads standard unix archive files (.a) containing LLVM bitcode.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ArchiveInternals.h"
15#include "llvm/ADT/SmallPtrSet.h"
16#include "llvm/Bitcode/ReaderWriter.h"
17#include "llvm/Support/MemoryBuffer.h"
18#include "llvm/Module.h"
19#include <cstdio>
20#include <cstdlib>
21#include <memory>
22using namespace llvm;
23
/// Read a variable-bit-rate (VBR) encoded unsigned integer.
///
/// Each input byte contributes its low 7 bits to the result, least significant
/// group first; a set high bit (0x80) means another byte follows.
///
/// \param At   Current read position; advanced past every byte consumed.
/// \param End  One past the last readable byte.
/// \returns the decoded value. If End is reached before a byte with a clear
/// high bit is seen, the bits gathered so far are returned. Groups that would
/// land at or beyond the width of 'unsigned' are consumed but discarded, so an
/// over-long encoding can never trigger an out-of-range shift.
static inline unsigned readInteger(const char*&At, const char*End) {
  const unsigned NumBits = sizeof(unsigned) * 8;
  unsigned Shift = 0;
  unsigned Result = 0;

  while (At != End) {
    const unsigned char Byte = static_cast<unsigned char>(*At++);
    // Shifting by >= the width of the type is undefined behaviour, so only
    // fold in groups that still fit. Shift stops growing once it is out of
    // range, which also keeps it from ever wrapping around.
    if (Shift < NumBits) {
      Result |= static_cast<unsigned>(Byte & 0x7F) << Shift;
      Shift += 7;
    }
    if ((Byte & 0x80) == 0)
      break;  // High bit clear: this was the final byte of the integer.
  }
  return Result;
}
37
38// Completely parse the Archive's symbol table and populate symTab member var.
39bool
40Archive::parseSymbolTable(const void* data, unsigned size, std::string* error) {
41  const char* At = (const char*) data;
42  const char* End = At + size;
43  while (At < End) {
44    unsigned offset = readInteger(At, End);
45    if (At == End) {
46      if (error)
47        *error = "Ran out of data reading vbr_uint for symtab offset!";
48      return false;
49    }
50    unsigned length = readInteger(At, End);
51    if (At == End) {
52      if (error)
53        *error = "Ran out of data reading vbr_uint for symtab length!";
54      return false;
55    }
56    if (At + length > End) {
57      if (error)
58        *error = "Malformed symbol table: length not consistent with size";
59      return false;
60    }
61    // we don't care if it can't be inserted (duplicate entry)
62    symTab.insert(std::make_pair(std::string(At, length), offset));
63    At += length;
64  }
65  symTabSize = size;
66  return true;
67}
68
69// This member parses an ArchiveMemberHeader that is presumed to be pointed to
70// by At. The At pointer is updated to the byte just after the header, which
71// can be variable in size.
72ArchiveMember*
73Archive::parseMemberHeader(const char*& At, const char* End, std::string* error)
74{
75  if (At + sizeof(ArchiveMemberHeader) >= End) {
76    if (error)
77      *error = "Unexpected end of file";
78    return 0;
79  }
80
81  // Cast archive member header
82  const ArchiveMemberHeader* Hdr = (const ArchiveMemberHeader*)At;
83  At += sizeof(ArchiveMemberHeader);
84
85  int flags = 0;
86  int MemberSize = atoi(Hdr->size);
87  assert(MemberSize >= 0);
88
89  // Check the size of the member for sanity
90  if (At + MemberSize > End) {
91    if (error)
92      *error = "invalid member length in archive file";
93    return 0;
94  }
95
96  // Check the member signature
97  if (!Hdr->checkSignature()) {
98    if (error)
99      *error = "invalid file member signature";
100    return 0;
101  }
102
103  // Convert and check the member name
104  // The empty name ( '/' and 15 blanks) is for a foreign (non-LLVM) symbol
105  // table. The special name "//" and 14 blanks is for a string table, used
106  // for long file names. This library doesn't generate either of those but
107  // it will accept them. If the name starts with #1/ and the remainder is
108  // digits, then those digits specify the length of the name that is
109  // stored immediately following the header. The special name
110  // __LLVM_SYM_TAB__ identifies the symbol table for LLVM bitcode.
111  // Anything else is a regular, short filename that is terminated with
112  // a '/' and blanks.
113
114  std::string pathname;
115  switch (Hdr->name[0]) {
116    case '#':
117      if (Hdr->name[1] == '1' && Hdr->name[2] == '/') {
118        if (isdigit(Hdr->name[3])) {
119          unsigned len = atoi(&Hdr->name[3]);
120          const char *nulp = (const char *)memchr(At, '\0', len);
121          pathname.assign(At, nulp != 0 ? (uintptr_t)(nulp - At) : len);
122          At += len;
123          MemberSize -= len;
124          flags |= ArchiveMember::HasLongFilenameFlag;
125        } else {
126          if (error)
127            *error = "invalid long filename";
128          return 0;
129        }
130      } else if (Hdr->name[1] == '_' &&
131                 (0 == memcmp(Hdr->name, ARFILE_LLVM_SYMTAB_NAME, 16))) {
132        // The member is using a long file name (>15 chars) format.
133        // This format is standard for 4.4BSD and Mac OSX operating
134        // systems. LLVM uses it similarly. In this format, the
135        // remainder of the name field (after #1/) specifies the
136        // length of the file name which occupy the first bytes of
137        // the member's data. The pathname already has the #1/ stripped.
138        pathname.assign(ARFILE_LLVM_SYMTAB_NAME);
139        flags |= ArchiveMember::LLVMSymbolTableFlag;
140      }
141      break;
142    case '/':
143      if (Hdr->name[1]== '/') {
144        if (0 == memcmp(Hdr->name, ARFILE_STRTAB_NAME, 16)) {
145          pathname.assign(ARFILE_STRTAB_NAME);
146          flags |= ArchiveMember::StringTableFlag;
147        } else {
148          if (error)
149            *error = "invalid string table name";
150          return 0;
151        }
152      } else if (Hdr->name[1] == ' ') {
153        if (0 == memcmp(Hdr->name, ARFILE_SVR4_SYMTAB_NAME, 16)) {
154          pathname.assign(ARFILE_SVR4_SYMTAB_NAME);
155          flags |= ArchiveMember::SVR4SymbolTableFlag;
156        } else {
157          if (error)
158            *error = "invalid SVR4 symbol table name";
159          return 0;
160        }
161      } else if (isdigit(Hdr->name[1])) {
162        unsigned index = atoi(&Hdr->name[1]);
163        if (index < strtab.length()) {
164          const char* namep = strtab.c_str() + index;
165          const char* endp = strtab.c_str() + strtab.length();
166          const char* p = namep;
167          const char* last_p = p;
168          while (p < endp) {
169            if (*p == '\n' && *last_p == '/') {
170              pathname.assign(namep, last_p - namep);
171              flags |= ArchiveMember::HasLongFilenameFlag;
172              break;
173            }
174            last_p = p;
175            p++;
176          }
177          if (p >= endp) {
178            if (error)
179              *error = "missing name termiantor in string table";
180            return 0;
181          }
182        } else {
183          if (error)
184            *error = "name index beyond string table";
185          return 0;
186        }
187      }
188      break;
189    case '_':
190      if (Hdr->name[1] == '_' &&
191          (0 == memcmp(Hdr->name, ARFILE_BSD4_SYMTAB_NAME, 16))) {
192        pathname.assign(ARFILE_BSD4_SYMTAB_NAME);
193        flags |= ArchiveMember::BSD4SymbolTableFlag;
194        break;
195      }
196      /* FALL THROUGH */
197
198    default:
199      const char* slash = (const char*) memchr(Hdr->name, '/', 16);
200      if (slash == 0)
201        slash = Hdr->name + 16;
202      pathname.assign(Hdr->name, slash - Hdr->name);
203      break;
204  }
205
206  // Determine if this is a bitcode file
207  switch (sys::IdentifyFileType(At, 4)) {
208    case sys::Bitcode_FileType:
209      flags |= ArchiveMember::BitcodeFlag;
210      break;
211    default:
212      flags &= ~ArchiveMember::BitcodeFlag;
213      break;
214  }
215
216  // Instantiate the ArchiveMember to be filled
217  ArchiveMember* member = new ArchiveMember(this);
218
219  // Fill in fields of the ArchiveMember
220  member->parent = this;
221  member->path.set(pathname);
222  member->info.fileSize = MemberSize;
223  member->info.modTime.fromEpochTime(atoi(Hdr->date));
224  unsigned int mode;
225  sscanf(Hdr->mode, "%o", &mode);
226  member->info.mode = mode;
227  member->info.user = atoi(Hdr->uid);
228  member->info.group = atoi(Hdr->gid);
229  member->flags = flags;
230  member->data = At;
231
232  return member;
233}
234
235bool
236Archive::checkSignature(std::string* error) {
237  // Check the magic string at file's header
238  if (mapfile->getBufferSize() < 8 || memcmp(base, ARFILE_MAGIC, 8)) {
239    if (error)
240      *error = "invalid signature for an archive file";
241    return false;
242  }
243  return true;
244}
245
246// This function loads the entire archive and fully populates its ilist with
247// the members of the archive file. This is typically used in preparation for
248// editing the contents of the archive.
249bool
250Archive::loadArchive(std::string* error) {
251
252  // Set up parsing
253  members.clear();
254  symTab.clear();
255  const char *At = base;
256  const char *End = mapfile->getBufferEnd();
257
258  if (!checkSignature(error))
259    return false;
260
261  At += 8;  // Skip the magic string.
262
263  bool seenSymbolTable = false;
264  bool foundFirstFile = false;
265  while (At < End) {
266    // parse the member header
267    const char* Save = At;
268    ArchiveMember* mbr = parseMemberHeader(At, End, error);
269    if (!mbr)
270      return false;
271
272    // check if this is the foreign symbol table
273    if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) {
274      // We just save this but don't do anything special
275      // with it. It doesn't count as the "first file".
276      if (foreignST) {
277        // What? Multiple foreign symbol tables? Just chuck it
278        // and retain the last one found.
279        delete foreignST;
280      }
281      foreignST = mbr;
282      At += mbr->getSize();
283      if ((intptr_t(At) & 1) == 1)
284        At++;
285    } else if (mbr->isStringTable()) {
286      // Simply suck the entire string table into a string
287      // variable. This will be used to get the names of the
288      // members that use the "/ddd" format for their names
289      // (SVR4 style long names).
290      strtab.assign(At, mbr->getSize());
291      At += mbr->getSize();
292      if ((intptr_t(At) & 1) == 1)
293        At++;
294      delete mbr;
295    } else if (mbr->isLLVMSymbolTable()) {
296      // This is the LLVM symbol table for the archive. If we've seen it
297      // already, its an error. Otherwise, parse the symbol table and move on.
298      if (seenSymbolTable) {
299        if (error)
300          *error = "invalid archive: multiple symbol tables";
301        return false;
302      }
303      if (!parseSymbolTable(mbr->getData(), mbr->getSize(), error))
304        return false;
305      seenSymbolTable = true;
306      At += mbr->getSize();
307      if ((intptr_t(At) & 1) == 1)
308        At++;
309      delete mbr; // We don't need this member in the list of members.
310    } else {
311      // This is just a regular file. If its the first one, save its offset.
312      // Otherwise just push it on the list and move on to the next file.
313      if (!foundFirstFile) {
314        firstFileOffset = Save - base;
315        foundFirstFile = true;
316      }
317      members.push_back(mbr);
318      At += mbr->getSize();
319      if ((intptr_t(At) & 1) == 1)
320        At++;
321    }
322  }
323  return true;
324}
325
326// Open and completely load the archive file.
327Archive*
328Archive::OpenAndLoad(const sys::Path& file, LLVMContext& C,
329                     std::string* ErrorMessage) {
330  std::auto_ptr<Archive> result ( new Archive(file, C));
331  if (result->mapToMemory(ErrorMessage))
332    return 0;
333  if (!result->loadArchive(ErrorMessage))
334    return 0;
335  return result.release();
336}
337
338// Get all the bitcode modules from the archive
339bool
340Archive::getAllModules(std::vector<Module*>& Modules,
341                       std::string* ErrMessage) {
342
343  for (iterator I=begin(), E=end(); I != E; ++I) {
344    if (I->isBitcode()) {
345      std::string FullMemberName = archPath.str() +
346        "(" + I->getPath().str() + ")";
347      MemoryBuffer *Buffer =
348        MemoryBuffer::getMemBufferCopy(StringRef(I->getData(), I->getSize()),
349                                       FullMemberName.c_str());
350
351      Module *M = ParseBitcodeFile(Buffer, Context, ErrMessage);
352      delete Buffer;
353      if (!M)
354        return true;
355
356      Modules.push_back(M);
357    }
358  }
359  return false;
360}
361
362// Load just the symbol table from the archive file
363bool
364Archive::loadSymbolTable(std::string* ErrorMsg) {
365
366  // Set up parsing
367  members.clear();
368  symTab.clear();
369  const char *At = base;
370  const char *End = mapfile->getBufferEnd();
371
372  // Make sure we're dealing with an archive
373  if (!checkSignature(ErrorMsg))
374    return false;
375
376  At += 8; // Skip signature
377
378  // Parse the first file member header
379  const char* FirstFile = At;
380  ArchiveMember* mbr = parseMemberHeader(At, End, ErrorMsg);
381  if (!mbr)
382    return false;
383
384  if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) {
385    // Skip the foreign symbol table, we don't do anything with it
386    At += mbr->getSize();
387    if ((intptr_t(At) & 1) == 1)
388      At++;
389    delete mbr;
390
391    // Read the next one
392    FirstFile = At;
393    mbr = parseMemberHeader(At, End, ErrorMsg);
394    if (!mbr) {
395      delete mbr;
396      return false;
397    }
398  }
399
400  if (mbr->isStringTable()) {
401    // Process the string table entry
402    strtab.assign((const char*)mbr->getData(), mbr->getSize());
403    At += mbr->getSize();
404    if ((intptr_t(At) & 1) == 1)
405      At++;
406    delete mbr;
407    // Get the next one
408    FirstFile = At;
409    mbr = parseMemberHeader(At, End, ErrorMsg);
410    if (!mbr) {
411      delete mbr;
412      return false;
413    }
414  }
415
416  // See if its the symbol table
417  if (mbr->isLLVMSymbolTable()) {
418    if (!parseSymbolTable(mbr->getData(), mbr->getSize(), ErrorMsg)) {
419      delete mbr;
420      return false;
421    }
422
423    At += mbr->getSize();
424    if ((intptr_t(At) & 1) == 1)
425      At++;
426    delete mbr;
427    // Can't be any more symtab headers so just advance
428    FirstFile = At;
429  } else {
430    // There's no symbol table in the file. We have to rebuild it from scratch
431    // because the intent of this method is to get the symbol table loaded so
432    // it can be searched efficiently.
433    // Add the member to the members list
434    members.push_back(mbr);
435  }
436
437  firstFileOffset = FirstFile - base;
438  return true;
439}
440
441// Open the archive and load just the symbol tables
442Archive* Archive::OpenAndLoadSymbols(const sys::Path& file,
443                                     LLVMContext& C,
444                                     std::string* ErrorMessage) {
445  std::auto_ptr<Archive> result ( new Archive(file, C) );
446  if (result->mapToMemory(ErrorMessage))
447    return 0;
448  if (!result->loadSymbolTable(ErrorMessage))
449    return 0;
450  return result.release();
451}
452
453// Look up one symbol in the symbol table and return the module that defines
454// that symbol.
455Module*
456Archive::findModuleDefiningSymbol(const std::string& symbol,
457                                  std::string* ErrMsg) {
458  SymTabType::iterator SI = symTab.find(symbol);
459  if (SI == symTab.end())
460    return 0;
461
462  // The symbol table was previously constructed assuming that the members were
463  // written without the symbol table header. Because VBR encoding is used, the
464  // values could not be adjusted to account for the offset of the symbol table
465  // because that could affect the size of the symbol table due to VBR encoding.
466  // We now have to account for this by adjusting the offset by the size of the
467  // symbol table and its header.
468  unsigned fileOffset =
469    SI->second +                // offset in symbol-table-less file
470    firstFileOffset;            // add offset to first "real" file in archive
471
472  // See if the module is already loaded
473  ModuleMap::iterator MI = modules.find(fileOffset);
474  if (MI != modules.end())
475    return MI->second.first;
476
477  // Module hasn't been loaded yet, we need to load it
478  const char* modptr = base + fileOffset;
479  ArchiveMember* mbr = parseMemberHeader(modptr, mapfile->getBufferEnd(),
480                                         ErrMsg);
481  if (!mbr)
482    return 0;
483
484  // Now, load the bitcode module to get the Module.
485  std::string FullMemberName = archPath.str() + "(" +
486    mbr->getPath().str() + ")";
487  MemoryBuffer *Buffer =
488    MemoryBuffer::getMemBufferCopy(StringRef(mbr->getData(), mbr->getSize()),
489                                   FullMemberName.c_str());
490
491  Module *m = getLazyBitcodeModule(Buffer, Context, ErrMsg);
492  if (!m)
493    return 0;
494
495  modules.insert(std::make_pair(fileOffset, std::make_pair(m, mbr)));
496
497  return m;
498}
499
500// Look up multiple symbols in the symbol table and return a set of
501// Modules that define those symbols.
502bool
503Archive::findModulesDefiningSymbols(std::set<std::string>& symbols,
504                                    SmallVectorImpl<Module*>& result,
505                                    std::string* error) {
506  if (!mapfile || !base) {
507    if (error)
508      *error = "Empty archive invalid for finding modules defining symbols";
509    return false;
510  }
511
512  if (symTab.empty()) {
513    // We don't have a symbol table, so we must build it now but lets also
514    // make sure that we populate the modules table as we do this to ensure
515    // that we don't load them twice when findModuleDefiningSymbol is called
516    // below.
517
518    // Get a pointer to the first file
519    const char* At  = base + firstFileOffset;
520    const char* End = mapfile->getBufferEnd();
521
522    while ( At < End) {
523      // Compute the offset to be put in the symbol table
524      unsigned offset = At - base - firstFileOffset;
525
526      // Parse the file's header
527      ArchiveMember* mbr = parseMemberHeader(At, End, error);
528      if (!mbr)
529        return false;
530
531      // If it contains symbols
532      if (mbr->isBitcode()) {
533        // Get the symbols
534        std::vector<std::string> symbols;
535        std::string FullMemberName = archPath.str() + "(" +
536          mbr->getPath().str() + ")";
537        Module* M =
538          GetBitcodeSymbols(At, mbr->getSize(), FullMemberName, Context,
539                            symbols, error);
540
541        if (M) {
542          // Insert the module's symbols into the symbol table
543          for (std::vector<std::string>::iterator I = symbols.begin(),
544               E=symbols.end(); I != E; ++I ) {
545            symTab.insert(std::make_pair(*I, offset));
546          }
547          // Insert the Module and the ArchiveMember into the table of
548          // modules.
549          modules.insert(std::make_pair(offset, std::make_pair(M, mbr)));
550        } else {
551          if (error)
552            *error = "Can't parse bitcode member: " +
553              mbr->getPath().str() + ": " + *error;
554          delete mbr;
555          return false;
556        }
557      }
558
559      // Go to the next file location
560      At += mbr->getSize();
561      if ((intptr_t(At) & 1) == 1)
562        At++;
563    }
564  }
565
566  // At this point we have a valid symbol table (one way or another) so we
567  // just use it to quickly find the symbols requested.
568
569  SmallPtrSet<Module*, 16> Added;
570  for (std::set<std::string>::iterator I=symbols.begin(),
571         Next = I,
572         E=symbols.end(); I != E; I = Next) {
573    // Increment Next before we invalidate it.
574    ++Next;
575
576    // See if this symbol exists
577    Module* m = findModuleDefiningSymbol(*I,error);
578    if (!m)
579      continue;
580    bool NewMember = Added.insert(m);
581    if (!NewMember)
582      continue;
583
584    // The symbol exists, insert the Module into our result.
585    result.push_back(m);
586
587    // Remove the symbol now that its been resolved.
588    symbols.erase(I);
589  }
590  return true;
591}
592
593bool Archive::isBitcodeArchive() {
594  // Make sure the symTab has been loaded. In most cases this should have been
595  // done when the archive was constructed, but still,  this is just in case.
596  if (symTab.empty())
597    if (!loadSymbolTable(0))
598      return false;
599
600  // Now that we know it's been loaded, return true
601  // if it has a size
602  if (symTab.size()) return true;
603
604  // We still can't be sure it isn't a bitcode archive
605  if (!loadArchive(0))
606    return false;
607
608  std::vector<Module *> Modules;
609  std::string ErrorMessage;
610
611  // Scan the archive, trying to load a bitcode member.  We only load one to
612  // see if this works.
613  for (iterator I = begin(), E = end(); I != E; ++I) {
614    if (!I->isBitcode())
615      continue;
616
617    std::string FullMemberName =
618      archPath.str() + "(" + I->getPath().str() + ")";
619
620    MemoryBuffer *Buffer =
621      MemoryBuffer::getMemBufferCopy(StringRef(I->getData(), I->getSize()),
622                                     FullMemberName.c_str());
623    Module *M = ParseBitcodeFile(Buffer, Context);
624    delete Buffer;
625    if (!M)
626      return false;  // Couldn't parse bitcode, not a bitcode archive.
627    delete M;
628    return true;
629  }
630
631  return false;
632}
633