1#!/usr/bin/env python
2#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4#                     The LLVM Compiler Infrastructure
5#
6# This file is distributed under the University of Illinois Open Source
7# License. See LICENSE.TXT for details.
8#
9#===------------------------------------------------------------------------===#
10import argparse
11import bisect
12import getopt
13import os
14import re
15import subprocess
16import sys
17
# Cache of ChainSymbolizer objects, keyed by binary path.
symbolizers = {}
# When true, launched commands and the text exchanged with the symbolizers
# are printed.
DEBUG = False
# Whether the external symbolizers are asked to demangle function names.
demangle = False
# Optional prefix prepended to the 'addr2line' tool name.
binutils_prefix = None
# Optional prefix prepended to binary names by sysroot_path_filter().
sysroot_path = None
# Optional callable applied to each binary name before symbolization.
binary_name_filter = None
# Optional list of patterns that fix_filename() cuts from file names.
fix_filename_patterns = None
# File object the log to symbolize is read from.
logfile = sys.stdin
# When false, a failure of the non-system symbolizers raises instead of
# falling back to the system symbolizer.
allow_system_symbolizer = True
# When true, llvm-symbolizer is skipped and only the system symbolizer is used.
force_system_symbolizer = False
28
29# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Normalize a source location string reported by a symbolizer.

  Args:
      file_name: location text, e.g. 'path/to/file.cc:12'.
  Returns:
      file_name after these rewrites, applied in order:
        * for every entry of the global fix_filename_patterns, everything up
          to and including the match of that pattern is removed;
        * a location inside an asan_*.cc file is replaced by '_asan_rtl_';
        * a 'crtstuff.c:0' location is replaced by '???:0'.
  """
  if fix_filename_patterns:
    for path_to_cut in fix_filename_patterns:
      # path_to_cut is used as a regular expression (the command line help
      # calls it a "pattern"), so it is not escaped here.
      file_name = re.sub('.*' + path_to_cut, '', file_name)
  # Raw strings with escaped dots: only a literal '.cc' / '.c' suffix should
  # match, not an arbitrary character in that position.
  file_name = re.sub(r'.*asan_[a-z_]*\.cc:[0-9]*', '_asan_rtl_', file_name)
  file_name = re.sub(r'.*crtstuff\.c:0', '???:0', file_name)
  return file_name
37
def sysroot_path_filter(binary_name):
  """Return binary_name with the global sysroot_path prepended to it."""
  prefixed_name = sysroot_path + binary_name
  return prefixed_name
40
def is_valid_arch(s):
  """Tell whether s is one of the architecture names this script accepts."""
  known_archs = (
      "i386", "x86_64", "x86_64h",
      "arm", "armv6", "armv7", "armv7s", "armv7k", "arm64",
      "powerpc64", "powerpc64le",
      "s390x", "s390",
  )
  return s in known_archs
44
def guess_arch(addr):
  """Guess the architecture from the textual length of an address.

  Anything longer than '0x' plus 8 hex digits is taken to be 'x86_64',
  everything else 'i386'.
  """
  longest_32bit_text = len('0x') + 8
  return 'x86_64' if len(addr) > longest_32bit_text else 'i386'
51
class Symbolizer(object):
  """Common interface of the symbolizer back-ends defined in this file."""

  def __init__(self):
    """The base class keeps no state."""

  def symbolize(self, addr, binary, offset):
    """Symbolize the given address (pair of binary and offset).

    Overridden in subclasses; this base implementation symbolizes nothing.
    Args:
        addr: virtual address of an instruction.
        binary: path to executable/shared object containing this instruction.
        offset: instruction offset in the @binary.
    Returns:
        list of strings (one string for each inlined frame) describing
        the code locations for this instruction (that is, function name, file
        name, line and column numbers). The base class always returns None.
    """
    return None
70
71
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that talks to one llvm-symbolizer child process over pipes."""

  def __init__(self, symbolizer_path, default_arch, system, dsym_hints=None):
    """Store the configuration and start the llvm-symbolizer process.

    Args:
        symbolizer_path: program name or path of llvm-symbolizer.
        default_arch: value passed to the tool as --default-arch.
        system: platform name; dsym_hints are only used when it is 'Darwin'.
        dsym_hints: optional iterable of values passed as --dsym-hint.
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    self.default_arch = default_arch
    self.system = system
    # A None default (instead of a literal []) avoids sharing one mutable
    # list object between all instances created without hints.
    self.dsym_hints = [] if dsym_hints is None else dsym_hints
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Launch llvm-symbolizer.

    Returns:
        the subprocess.Popen object, or None if the program could not be
        started (OSError).
    """
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=%s' % demangle,
           '--functions=linkage',
           '--inlining=true',
           '--default-arch=%s' % self.default_arch]
    if self.system == 'Darwin':
      for hint in self.dsym_hints:
        cmd.append('--dsym-hint=%s' % hint)
    if DEBUG:
      print(' '.join(cmd))
    try:
      result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                bufsize=0,
                                universal_newlines=True)
    except OSError:
      result = None
    return result

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        list of '<addr> in <function> <file>' strings, one per reported
        frame, or None when the process is not running, the exchange failed
        or only unknown ('??') frames were reported.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '"%s" %s' % (binary, offset)
      if DEBUG:
        print(symbolizer_input)
      self.pipe.stdin.write("%s\n" % symbolizer_input)
      # The reply is read as (function, file) line pairs until an empty line
      # is seen where a function name is expected.
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') or
            not file_name.startswith('??')):
          # Append only non-trivial frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception:
      # Deliberately best-effort: any failure while talking to the child is
      # reported as "no result" so that callers can fall back to another
      # symbolizer.
      result = []
    if not result:
      result = None
    return result
128
129
def LLVMSymbolizerFactory(system, default_arch, dsym_hints=None):
  """Create an LLVMSymbolizer, locating the tool through the environment.

  The program is taken from LLVM_SYMBOLIZER_PATH, then ASAN_SYMBOLIZER_PATH;
  if neither is set (or both are empty) 'llvm-symbolizer' is assumed to be in
  PATH.

  Args:
      system: platform name forwarded to LLVMSymbolizer.
      default_arch: architecture forwarded to LLVMSymbolizer.
      dsym_hints: optional iterable of .dSYM hints forwarded to
          LLVMSymbolizer.
  Returns:
      the new LLVMSymbolizer.
  """
  symbolizer_path = (os.getenv('LLVM_SYMBOLIZER_PATH') or
                     os.getenv('ASAN_SYMBOLIZER_PATH') or
                     'llvm-symbolizer')
  # None replaces the former mutable [] default; an empty list is still what
  # gets forwarded when the caller supplies no hints.
  if dsym_hints is None:
    dsym_hints = []
  return LLVMSymbolizer(symbolizer_path, default_arch, system, dsym_hints)
138
139
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer for a single binary, backed by one addr2line child process."""

  def __init__(self, binary):
    """Start addr2line for the given binary.

    Args:
        binary: path of the executable/shared object to symbolize.
    """
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()
    # Extra address sent after every query; its reply marks the end of the
    # (variable-length) answer to the real query. See symbolize().
    self.output_terminator = -1

  def open_addr2line(self):
    """Launch addr2line (with the global binutils_prefix, if any).

    Returns:
        the subprocess.Popen object connected through stdin/stdout pipes.
    """
    addr2line_tool = 'addr2line'
    if binutils_prefix:
      addr2line_tool = binutils_prefix + addr2line_tool
    cmd = [addr2line_tool, '-fi']
    if demangle:
      cmd += ['--demangle']
    cmd += ['-e', self.binary]
    if DEBUG:
      print(' '.join(cmd))
    return subprocess.Popen(cmd,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            bufsize=0,
                            universal_newlines=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if this symbolizer was created for a different binary, else a
        list of '<addr> in <function> <file>' strings, one per frame. If the
        exchange with addr2line fails, a '?? ??:0' frame is appended.
    """
    if self.binary != binary:
      return None
    lines = []
    try:
      self.pipe.stdin.write("%s\n" % offset)
      self.pipe.stdin.write("%s\n" % self.output_terminator)
      is_first_frame = True
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        file_name = self.pipe.stdout.readline().rstrip()
        if is_first_frame:
          # The first pair always belongs to the queried offset, even when
          # it is unknown.
          is_first_frame = False
        elif function_name in ('', '??'):
          # Reply to the terminator address (or end of output). The location
          # line that accompanies '??' is not '??' itself, so it must not be
          # compared with the function name: doing so made every query end
          # with a spurious '?? ??:0' frame.
          break
        lines.append((function_name, file_name))
    except Exception:
      # Best-effort: report an unknown frame instead of failing the run.
      lines.append(('??', '??:0'))
    return ['%s in %s %s' % (addr, function_name, fix_filename(location))
            for (function_name, location) in lines]
183
class UnbufferedLineConverter(object):
  """
  Drive a child process that answers every input line with one output line.
  The child runs on a pseudo-terminal so that it does not buffer its output.
  """
  def __init__(self, args, close_stderr=False):
    # pty and termios are imported here, not at module level, so that the
    # script can still start on Windows.
    import pty
    import termios
    child_pid, pty_fd = pty.fork()
    if child_pid != 0:
      # Parent side. Switch off echoing on the terminal first.
      tty_attrs = termios.tcgetattr(pty_fd)
      tty_attrs[3] &= ~termios.ECHO
      termios.tcsetattr(pty_fd, termios.TCSANOW, tty_attrs)
      # Line-buffered file objects for talking to the child: one for
      # reading, and one for writing on a duplicate of the descriptor.
      self.r = os.fdopen(pty_fd, "r", 1)
      self.w = os.fdopen(os.dup(pty_fd), "w", 1)
    else:
      # Child side: optionally replace stderr with /dev/null, then hand
      # control over to the requested command.
      if close_stderr:
        null_fd = os.open('/dev/null', 0)
        os.dup2(null_fd, 2)
      os.execvp(args[0], args)

  def convert(self, line):
    """Send one line to the child and return its one-line answer."""
    self.w.write(line + "\n")
    answer = self.readline()
    return answer

  def readline(self):
    """Read the next line from the child, without trailing whitespace."""
    raw_line = self.r.readline()
    return raw_line.rstrip()
215
216
class DarwinSymbolizer(Symbolizer):
  """Symbolizer for a single binary, backed by an atos child process."""

  def __init__(self, addr, binary, arch):
    """Start atos for the given binary and architecture.

    Args:
        addr: unused; kept for compatibility with existing callers.
        binary: path of the executable/shared object to symbolize.
        arch: architecture name passed to atos as -arch.
    """
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    self.arch = arch
    self.open_atos()

  def open_atos(self):
    """Launch atos behind an UnbufferedLineConverter (stored in self.atos)."""
    if DEBUG:
      print('atos -o %s -arch %s' % (self.binary, self.arch))
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if this symbolizer was created for a different binary, else a
        one-element list describing the frame.
    """
    if self.binary != binary:
      return None
    if not os.path.exists(binary):
      # If the binary doesn't exist atos will exit which will lead to IOError
      # exceptions being raised later on so just don't try to symbolize.
      return ['{} ({}:{}+{})'.format(addr, binary, self.arch, offset)]
    atos_line = self.atos.convert('0x%x' % int(offset, 16))
    # Skip informational lines that precede the actual answer.
    while "got symbolicator for" in atos_line:
      atos_line = self.atos.readline()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    # Raw strings keep the regex escapes ('\(' and '\d') out of Python's own
    # string-escape processing.
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    if DEBUG:
      print('atos_line: ', atos_line)
    if match:
      function_name = match.group(1)
      # Drop the parenthesized argument list from the function name.
      function_name = re.sub(r'\(.*?\)', '', function_name)
      file_name = fix_filename(match.group(3))
      return ['%s in %s %s' % (addr, function_name, file_name)]
    else:
      return ['%s in %s' % (addr, atos_line)]
253
254
255# Chain several symbolizers so that if one symbolizer fails, we fall back
256# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries the symbolizers of a list in order until one gives a result."""

  def __init__(self, symbolizer_list):
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize."""
    for candidate in self.symbolizer_list:
      # Entries may be None (or otherwise false); those are skipped.
      if not candidate:
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add one more symbolizer at the end of the chain."""
    self.symbolizer_list.append(symbolizer)
273
274
def BreakpadSymbolizerFactory(binary):
  """Create a BreakpadSymbolizer for binary if a symbol file exists.

  The symbol file name is binary followed by the value of the
  BREAKPAD_SUFFIX environment variable. Returns None when the variable is
  unset or empty, or when that file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  filename = binary + suffix
  if not os.access(filename, os.F_OK):
    return None
  return BreakpadSymbolizer(filename)
282
283
def SystemSymbolizerFactory(system, addr, binary, arch):
  """Create the platform's own symbolizer for binary.

  'Darwin' gets a DarwinSymbolizer; 'Linux', 'FreeBSD', 'NetBSD' and 'SunOS'
  get an Addr2LineSymbolizer; any other system name yields None.
  """
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary, arch)
  if system in ('Linux', 'FreeBSD', 'NetBSD', 'SunOS'):
    return Addr2LineSymbolizer(binary)
  return None
289
290
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that answers from a Breakpad symbol file loaded in memory."""

  def __init__(self, filename):
    """Read and parse the symbol file.

    Args:
        filename: path of the Breakpad symbol file.
    """
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    # open() instead of the Python 2-only file() builtin; the with block
    # also closes the handle once the lines are read.
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    # File names, indexed by their FILE record number.
    self.files = []
    # Symbol start address -> symbol name.
    self.symbols = {}
    # Sorted start addresses of all line records.
    self.address_list = []
    # Line record start address -> (symbol address, size, line, file number).
    self.addresses = {}
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Fill files, symbols, address_list and addresses from record lines.

    Args:
        lines: the symbol file lines that follow the MODULE line.
    """
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Blank line: nothing to parse (indexing it would raise IndexError).
        continue
      if fragments[0] == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif fragments[0] == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif fragments[0] in ['CFI', 'STACK']:
        pass
      elif fragments[0] == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A name recorded earlier for the same address is kept.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(fragments[0], 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Look up the line record covering addr.

    Args:
        addr: integer offset inside the binary.
    Returns:
        (symbol name, file name, line number), or None if no line record
        covers addr.
    """
    key = None
    if addr in self.addresses:
      key = addr
    else:
      # Not an exact record start: use the closest record that starts below.
      index = bisect.bisect_left(self.address_list, addr)
      if index == 0:
        return None
      else:
        key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    if addr < key + size:
      return symbol, filename, line_no
    else:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        a one-element list '<addr> in <function> <file>:<line>', or None if
        the binary does not match or the offset is not covered.
    """
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if res:
      function_name, file_name, line_no = res
      result = ['%s in %s %s:%d' % (
          addr, function_name, file_name, line_no)]
      # Diagnostic dump only; printing it unconditionally mixed a Python
      # list repr into the symbolized output.
      if DEBUG:
        print(result)
      return result
    else:
      return None
363
364
class SymbolizationLoop(object):
  """Reads a sanitizer log line by line and symbolizes its stack frames."""

  # Matches a stack frame line such as:
  #   #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
  # Groups: frame prefix, frame number, address, binary, offset.
  # Compiled once, from a raw string so that '\(' and '\+' reach the regex
  # engine untouched by Python's string-escape processing.
  _STACK_TRACE_LINE_RE = re.compile(
      r'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')

  def __init__(self, binary_name_filter=None, dsym_hint_producer=None):
    """Select the per-line handler for the current platform.

    Args:
        binary_name_filter: optional callable mapping a binary name from the
            log to the name that should be symbolized.
        dsym_hint_producer: optional callable returning the .dSYM hints for
            a binary (only consulted on Darwin).
    Raises:
        Exception: on a non-Windows system that is not supported.
    """
    if sys.platform == 'win32':
      # ASan on Windows uses dbghelp.dll to symbolize in-process, which works
      # even in sandboxed processes.  Nothing needs to be done here.
      self.process_line = self.process_line_echo
    else:
      # Used by clients who may want to supply a different binary name.
      # E.g. in Chrome several binaries may share a single .dSYM.
      self.binary_name_filter = binary_name_filter
      self.dsym_hint_producer = dsym_hint_producer
      self.system = os.uname()[0]
      if self.system not in ['Linux', 'Darwin', 'FreeBSD', 'NetBSD','SunOS']:
        raise Exception('Unknown system')
      self.llvm_symbolizers = {}
      self.last_llvm_symbolizer = None
      self.dsym_hints = set()
      self.frame_no = 0
      self.process_line = self.process_line_posix

  def symbolize_address(self, addr, binary, offset, arch):
    """Symbolize one frame, falling back to the system symbolizer.

    Args:
        addr: address text from the log line.
        binary: path of the binary containing the address.
        offset: offset text of the address inside binary.
        arch: architecture name used when creating symbolizers.
    Returns:
        non-empty list of frame descriptions.
    Raises:
        Exception: if no result was produced and the system symbolizer is
            not allowed.
    """
    # On non-Darwin (i.e. on platforms without .dSYM debug info) always use
    # a single symbolizer binary.
    # On Darwin, if the dsym hint producer is present:
    #  1. check whether we've seen this binary already; if so,
    #     use |llvm_symbolizers[binary]|, which has already loaded the debug
    #     info for this binary (might not be the case for
    #     |last_llvm_symbolizer|);
    #  2. otherwise check if we've seen all the hints for this binary already;
    #     if so, reuse |last_llvm_symbolizer| which has the full set of hints;
    #  3. otherwise create a new symbolizer and pass all currently known
    #     .dSYM hints to it.
    result = None
    if not force_system_symbolizer:
      if binary not in self.llvm_symbolizers:
        use_new_symbolizer = True
        if self.system == 'Darwin' and self.dsym_hint_producer:
          dsym_hints_for_binary = set(self.dsym_hint_producer(binary))
          use_new_symbolizer = bool(dsym_hints_for_binary - self.dsym_hints)
          self.dsym_hints |= dsym_hints_for_binary
        if self.last_llvm_symbolizer and not use_new_symbolizer:
          self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
        else:
          self.last_llvm_symbolizer = LLVMSymbolizerFactory(
              self.system, arch, self.dsym_hints)
          self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
      # Use the chain of symbolizers:
      # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
      # (fall back to next symbolizer if the previous one fails).
      if binary not in symbolizers:
        symbolizers[binary] = ChainSymbolizer(
            [BreakpadSymbolizerFactory(binary), self.llvm_symbolizers[binary]])
      result = symbolizers[binary].symbolize(addr, binary, offset)
    else:
      # Keep one chain per binary. Replacing it on every call started a
      # fresh system symbolizer process for every single stack frame.
      if binary not in symbolizers:
        symbolizers[binary] = ChainSymbolizer([])
      result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      if not allow_system_symbolizer:
        raise Exception('Failed to launch or use llvm-symbolizer.')
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary, arch))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def get_symbolized_lines(self, symbolized_lines):
    """Format symbolized frames as numbered output lines.

    Args:
        symbolized_lines: list of frame descriptions, or a false value.
    Returns:
        one '    #<n> <frame>' line per frame, numbered from self.frame_no
        (which is advanced); the unmodified current line if there are no
        frames.
    """
    if not symbolized_lines:
      return [self.current_line]
    else:
      result = []
      for symbolized_frame in symbolized_lines:
        result.append('    #%s %s' % (str(self.frame_no), symbolized_frame.rstrip()))
        self.frame_no += 1
      return result

  def process_logfile(self):
    """Process every line of the global logfile and print the result."""
    self.frame_no = 0
    for line in logfile:
      processed = self.process_line(line)
      print('\n'.join(processed))

  def process_line_echo(self, line):
    """Return the line unchanged, apart from trailing whitespace."""
    return [line.rstrip()]

  def process_line_posix(self, line):
    """Symbolize line if it is a stack frame, else return it as is.

    Args:
        line: one line of the log.
    Returns:
        list of output lines replacing the input line.
    """
    self.current_line = line.rstrip()
    match = self._STACK_TRACE_LINE_RE.match(line)
    if not match:
      return [self.current_line]
    if DEBUG:
      print(line)
    _, frameno_str, addr, binary, offset = match.groups()
    arch = ""
    # Arch can be embedded in the filename, e.g.: "libabc.dylib:x86_64h"
    colon_pos = binary.rfind(":")
    if colon_pos != -1:
      maybe_arch = binary[colon_pos+1:]
      if is_valid_arch(maybe_arch):
        arch = maybe_arch
        binary = binary[0:colon_pos]
    if arch == "":
      arch = guess_arch(addr)
    if frameno_str == '0':
      # Assume that frame #0 is the first frame of new stack trace.
      self.frame_no = 0
    original_binary = binary
    if self.binary_name_filter:
      binary = self.binary_name_filter(binary)
    symbolized_line = self.symbolize_address(addr, binary, offset, arch)
    if not symbolized_line:
      # Retry with the unfiltered name if the filter changed it.
      if original_binary != binary:
        symbolized_line = self.symbolize_address(addr, original_binary, offset, arch)
    return self.get_symbolized_lines(symbolized_line)
482
483
if __name__ == '__main__':
  # Command-line entry point: describe the options, copy the parsed values
  # into the module-level settings, then symbolize the selected log.
  parser = argparse.ArgumentParser(
      description='ASan symbolization script',
      epilog='Example of use:\n'
             'asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" '
             '-s "$HOME/SymbolFiles" < asan.log',
      formatter_class=argparse.RawDescriptionHelpFormatter)
  parser.add_argument('path_to_cut', nargs='*',
                      help='pattern to be cut from the result file path ')
  parser.add_argument('-d','--demangle', action='store_true',
                      help='demangle function names')
  parser.add_argument('-s', metavar='SYSROOT',
                      help='set path to sysroot for sanitized binaries')
  parser.add_argument('-c', metavar='CROSS_COMPILE',
                      help='set prefix for binutils')
  parser.add_argument('-l','--logfile', default=sys.stdin,
                      type=argparse.FileType('r'),
                      help='set log file name to parse, default is stdin')
  parser.add_argument('--force-system-symbolizer', action='store_true',
                      help='don\'t use llvm-symbolizer')
  args = parser.parse_args()

  # Each setting keeps its default unless the matching option was given.
  fix_filename_patterns = args.path_to_cut or fix_filename_patterns
  demangle = demangle or args.demangle
  if args.s:
    sysroot_path = args.s
    binary_name_filter = sysroot_path_filter
  binutils_prefix = args.c or binutils_prefix
  logfile = args.logfile if args.logfile else sys.stdin
  force_system_symbolizer = (force_system_symbolizer or
                             args.force_system_symbolizer)
  # Forcing the system symbolizer only makes sense while it is allowed.
  assert allow_system_symbolizer or not force_system_symbolizer

  loop = SymbolizationLoop(binary_name_filter)
  loop.process_logfile()
524