xref: /openbsd-src/gnu/llvm/compiler-rt/lib/asan/scripts/asan_symbolize.py (revision 1a8dbaac879b9f3335ad7fb25429ce63ac1d6bac)
1#!/usr/bin/env python
2#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8#===------------------------------------------------------------------------===#
9"""
10Example of use:
11  asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" -s "$HOME/SymbolFiles" < asan.log
12
13PLUGINS
14
15This script provides a way for external plug-ins to hook into the behaviour of
16various parts of this script (see `--plugins`). This is useful for situations
17where it is necessary to handle site-specific quirks (e.g. binaries with debug
18symbols only accessible via a remote service) without having to modify the
19script itself.
20
21"""
22import argparse
23import bisect
24import errno
25import getopt
26import logging
27import os
28import re
29import subprocess
30import sys
31from distutils.spawn import find_executable
32
# Cache of ChainSymbolizer objects, keyed by binary path (filled in by
# SymbolizationLoop.symbolize_address).
symbolizers = {}
# When true, the symbolizer tools are asked to demangle function names.
demangle = False
# Optional prefix prepended to the `addr2line` tool name.
binutils_prefix = None
# Optional list of regex fragments; fix_filename() removes everything up to
# and including a match of each fragment.
fix_filename_patterns = None
# File object the log to be symbolized is read from.
logfile = sys.stdin
# When false, symbolize_address() raises instead of falling back to the
# system symbolizer (addr2line/atos).
allow_system_symbolizer = True
# When true, symbolize_address() skips the Breakpad/LLVM symbolizers and goes
# straight to the system symbolizer.
force_system_symbolizer = False
40
41# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Normalize a source location string reported by a symbolizer.

  Args:
      file_name: location text such as "/path/to/file.cc:123".
  Returns:
      The location with every prefix matched by `fix_filename_patterns`
      removed, ASan runtime sources collapsed to '_asan_rtl_' and
      'crtstuff.c:0' replaced by '???:0'.
  """
  # The patterns are regex fragments supplied by the user; `None` means that
  # no prefix stripping was requested.
  for path_to_cut in fix_filename_patterns or ():
    file_name = re.sub('.*' + path_to_cut, '', file_name)
  # The dot before the extension is escaped so that only real "asan_*.cc" /
  # "asan_*.cpp" files are rewritten, not any name with an arbitrary
  # character in that position.
  file_name = re.sub(r'.*asan_[a-z_]*\.(cc|cpp):[0-9]*', '_asan_rtl_', file_name)
  file_name = re.sub(r'.*crtstuff\.c:0', '???:0', file_name)
  return file_name
49
def is_valid_arch(s):
  """Return True when `s` names an architecture this script recognizes."""
  known_arches = (
      "i386", "x86_64", "x86_64h",
      "arm", "armv6", "armv7", "armv7s", "armv7k", "arm64",
      "powerpc64", "powerpc64le",
      "s390x", "s390",
  )
  return s in known_arches
53
def guess_arch(addr):
  """Guess the architecture from the textual width of an address.

  Args:
      addr: the address as a string, e.g. '0x7f6e35cf2e45'.
  Returns:
      'x86_64' when the string is longer than a 32-bit address spelling,
      otherwise 'i386'.
  """
  # '0x' followed by 8 hex digits is the longest 32-bit address spelling.
  longest_32bit_spelling = len('0x') + 8
  return 'x86_64' if len(addr) > longest_32bit_spelling else 'i386'
60
class Symbolizer(object):
  """Common interface implemented by every symbolizer back end."""

  def __init__(self):
    """The base class keeps no state."""

  def symbolize(self, addr, binary, offset):
    """Describe the code location of one instruction.

    Subclasses override this; the base implementation knows nothing and
    always answers None.

    Args:
        addr: virtual address of an instruction.
        binary: path to the executable/shared object containing the
            instruction.
        offset: offset of the instruction inside `binary`.
    Returns:
        A list of strings, one per inlined frame, each giving function name,
        file name, line and column numbers; or None when nothing is known.
    """
    return None
79
80
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that talks to a long-running llvm-symbolizer child process."""

  def __init__(self, symbolizer_path, default_arch, system, dsym_hints=None):
    """Start llvm-symbolizer.

    Args:
        symbolizer_path: path (or bare name) of the llvm-symbolizer binary.
        default_arch: value passed as --default-arch.
        system: OS name; .dSYM hints are only forwarded on 'Darwin'.
        dsym_hints: optional iterable of .dSYM hint paths.
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    self.default_arch = default_arch
    self.system = system
    # Build a fresh list per instance; a literal `[]` default would be one
    # object shared by every instance created without hints.
    self.dsym_hints = [] if dsym_hints is None else dsym_hints
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Spawn the child process; returns the Popen object or None on OSError."""
    cmd = [self.symbolizer_path,
           '--use-symbol-table=true',
           '--demangle=%s' % demangle,
           '--functions=linkage',
           '--inlining=true',
           '--default-arch=%s' % self.default_arch]
    if self.system == 'Darwin':
      for hint in self.dsym_hints:
        cmd.append('--dsym-hint=%s' % hint)
    logging.debug(' '.join(cmd))
    try:
      result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                bufsize=0,
                                universal_newlines=True)
    except OSError:
      # The tool could not be launched; symbolize() will report "no result".
      result = None
    return result

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns a list of '<addr> in <function> <file>' strings (one per inlined
    frame), or None when the child is unavailable or produced nothing useful.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '"%s" %s' % (binary, offset)
      logging.debug(symbolizer_input)
      self.pipe.stdin.write("%s\n" % symbolizer_input)
      # The answer is a sequence of (function, file) line pairs terminated
      # by an empty line.
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') or
            not file_name.startswith('??')):
          # Append only non-trivial frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception as e:
      # Best effort: any failure while talking to the child means "no
      # result" so the caller can fall back to another symbolizer, but leave
      # a trace of what went wrong.
      logging.debug("got exception communicating with llvm-symbolizer",
                    exc_info=e)
      result = []
    if not result:
      result = None
    return result
135
136
def LLVMSymbolizerFactory(system, default_arch, dsym_hints=None):
  """Create an LLVMSymbolizer, locating the tool through the environment.

  The binary is taken from $LLVM_SYMBOLIZER_PATH, then $ASAN_SYMBOLIZER_PATH,
  and finally assumed to be `llvm-symbolizer` somewhere in PATH.

  Args:
      system: OS name forwarded to LLVMSymbolizer.
      default_arch: architecture forwarded to LLVMSymbolizer.
      dsym_hints: optional iterable of .dSYM hints; defaults to no hints.
  """
  symbolizer_path = (os.getenv('LLVM_SYMBOLIZER_PATH') or
                     os.getenv('ASAN_SYMBOLIZER_PATH') or
                     'llvm-symbolizer')
  # Never hand out a shared mutable default: create the empty list per call.
  if dsym_hints is None:
    dsym_hints = []
  return LLVMSymbolizer(symbolizer_path, default_arch, system, dsym_hints)
145
146
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer that drives one `addr2line` child process for one binary."""

  def __init__(self, binary):
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()
    # Written after every query; seeing it echoed back marks the end of the
    # variable-length list of inlined frames.
    self.output_terminator = -1

  def open_addr2line(self):
    """Spawn `addr2line -fi [-demangle] -e <binary>` and return the Popen."""
    addr2line_tool = 'addr2line'
    if binutils_prefix:
      addr2line_tool = binutils_prefix + addr2line_tool
    logging.debug('addr2line binary is %s' % find_executable(addr2line_tool))
    cmd = [addr2line_tool, '-fi']
    if demangle:
      cmd += ['--demangle']
    cmd += ['-e', self.binary]
    logging.debug(' '.join(cmd))
    return subprocess.Popen(cmd,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            bufsize=0,
                            universal_newlines=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns None when `binary` is not the one this instance was created for,
    otherwise a list of '<addr> in <function> <file>' strings.
    """
    if self.binary != binary:
      return None
    lines = []
    try:
      self.pipe.stdin.write("%s\n" % offset)
      self.pipe.stdin.write("%s\n" % self.output_terminator)
      is_first_frame = True
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        logging.debug("read function_name='%s' from addr2line", function_name)
        # If llvm-symbolizer is installed as addr2line, older versions of
        # llvm-symbolizer will print -1 when presented with -1 and not print
        # a second line. In that case we will block for ever trying to read
        # the file name. This also happens for non-existent files, in which
        # case GNU addr2line exits immediately, but llvm-symbolizer does not
        # (see https://llvm.org/PR42754).
        if function_name == '-1':
          logging.debug("got function '-1' -> no more input")
          break
        file_name = self.pipe.stdout.readline().rstrip()
        logging.debug("read file_name='%s' from addr2line", file_name)
        if is_first_frame:
          is_first_frame = False
        elif function_name == '??':
          assert file_name == '??:0', file_name
          logging.debug("got function '??' -> no more input")
          break
        elif not function_name:
          assert not file_name, file_name
          logging.debug("got empty function name -> no more input")
          break
        if not function_name and not file_name:
          logging.debug("got empty function and file name -> unknown function")
          function_name = '??'
          file_name = '??:0'
        lines.append((function_name, file_name))
    except IOError as e:
      # EPIPE happens if addr2line exits early (which some implementations do
      # if an invalid file is passed).
      if e.errno == errno.EPIPE:
        # poll() returns None while the child has not been reaped yet, so the
        # return code is formatted with %s; %d would raise TypeError here.
        logging.debug("addr2line exited early (broken pipe), returncode=%s",
                      self.pipe.poll())
      else:
        logging.debug("unexpected I/O exception communicating with addr2line", exc_info=e)
      lines.append(('??', '??:0'))
    except Exception as e:
      logging.debug("got unknown exception communicating with addr2line", exc_info=e)
      lines.append(('??', '??:0'))
    return ['%s in %s %s' % (addr, function, fix_filename(location))
            for (function, location) in lines]
219
class UnbufferedLineConverter(object):
  """
  Wrap a child process that responds to each line of input with one line of
  output.  Uses pty to trick the child into providing unbuffered output.
  """
  def __init__(self, args, close_stderr=False):
    """Fork `args` on a pseudo-terminal.

    Args:
        args: command line (list); args[0] is looked up in PATH.
        close_stderr: when true the child's stderr is redirected to /dev/null.
    """
    # Local imports so that the script can start on Windows.
    import pty
    import termios
    pid, fd = pty.fork()
    if pid == 0:
      # We're the child. Transfer control to command.
      try:
        if close_stderr:
          dev_null = os.open('/dev/null', 0)
          os.dup2(dev_null, 2)
        os.execvp(args[0], args)
      finally:
        # Only reached when the redirection or the exec itself failed. The
        # child must die here; otherwise it would return from __init__ and
        # keep running the script as a second copy of the parent.
        os._exit(127)
    else:
      # Disable echoing.
      attr = termios.tcgetattr(fd)
      attr[3] = attr[3] & ~termios.ECHO
      termios.tcsetattr(fd, termios.TCSANOW, attr)
      # Set up a file()-like interface to the child process
      self.r = os.fdopen(fd, "r", 1)
      self.w = os.fdopen(os.dup(fd), "w", 1)

  def convert(self, line):
    """Send `line` to the child and return its one-line answer."""
    self.w.write(line + "\n")
    return self.readline()

  def readline(self):
    """Read one line from the child, without trailing whitespace."""
    return self.r.readline().rstrip()
251
252
class DarwinSymbolizer(Symbolizer):
  """Symbolizer that drives one `atos` child process for one binary/arch."""

  def __init__(self, addr, binary, arch):
    # `addr` is accepted for interface compatibility; it is not used here.
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    self.arch = arch
    self.open_atos()

  def open_atos(self):
    """Start `atos -o <binary> -arch <arch>` behind a pty."""
    logging.debug('atos -o %s -arch %s', self.binary, self.arch)
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns None when `binary` is not the one this instance serves, otherwise
    a single-element list describing the frame.
    """
    if self.binary != binary:
      return None
    if not os.path.exists(binary):
      # If the binary doesn't exist atos will exit which will lead to IOError
      # exceptions being raised later on so just don't try to symbolize.
      return ['{} ({}:{}+{})'.format(addr, binary, self.arch, offset)]
    atos_line = self.atos.convert('0x%x' % int(offset, 16))
    # Skip informational lines that precede the actual answer.
    while "got symbolicator for" in atos_line:
      atos_line = self.atos.readline()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    # Raw strings keep `\(` and `\d` as regex escapes instead of invalid
    # string escape sequences.
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    logging.debug('atos_line: %s', atos_line)
    if match:
      function_name = match.group(1)
      # Drop the parenthesized parameter list from the function name.
      function_name = re.sub(r'\(.*?\)', '', function_name)
      file_name = fix_filename(match.group(3))
      return ['%s in %s %s' % (addr, function_name, file_name)]
    else:
      return ['%s in %s' % (addr, atos_line)]
287
288
289# Chain several symbolizers so that if one symbolizer fails, we fall back
290# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Tries a list of symbolizers in order and returns the first real answer."""

  def __init__(self, symbolizer_list):
    super(ChainSymbolizer, self).__init__()
    # The list is kept by reference; append_symbolizer() extends it in place.
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize."""
    for candidate in self.symbolizer_list:
      # Factories may contribute None entries; those are simply skipped.
      if not candidate:
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add `symbolizer` as the last fallback of the chain."""
    self.symbolizer_list.append(symbolizer)
307
308
def BreakpadSymbolizerFactory(binary):
  """Return a BreakpadSymbolizer for `binary`, or None if unavailable.

  The symbol file is looked for at `binary` + $BREAKPAD_SUFFIX; None is
  returned when the variable is unset/empty or that file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  symbol_file = binary + suffix
  if not os.access(symbol_file, os.F_OK):
    return None
  return BreakpadSymbolizer(symbol_file)
316
317
def SystemSymbolizerFactory(system, addr, binary, arch):
  """Create the platform's native symbolizer for `binary`.

  Returns a DarwinSymbolizer on 'Darwin', an Addr2LineSymbolizer on
  'Linux'/'FreeBSD'/'NetBSD'/'SunOS', and None for any other system name.
  """
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary, arch)
  if system in ('Linux', 'FreeBSD', 'NetBSD', 'SunOS'):
    return Addr2LineSymbolizer(binary)
  return None
323
324
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer backed by a Breakpad text symbol file loaded into memory."""

  def __init__(self, filename):
    """Read and index the symbol file at `filename`."""
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    # open() exists on both Python 2 and 3 (the `file` builtin is Python 2
    # only), and the `with` block closes the handle deterministically.
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    self.files = []
    self.symbols = {}
    self.address_list = []
    self.addresses = {}
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Index the FILE, PUBLIC, FUNC and line records found in `lines`."""
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Blank line (e.g. a trailing newline at the end of the file).
        continue
      if fragments[0] == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif fragments[0] == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif fragments[0] in ['CFI', 'STACK']:
        pass
      elif fragments[0] == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A PUBLIC record seen earlier for the same address wins.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(fragments[0], 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Return (symbol, filename, line) covering `addr`, or None."""
    key = None
    if addr in self.addresses:
      key = addr
    else:
      # Otherwise take the closest record that starts below `addr`.
      index = bisect.bisect_left(self.address_list, addr)
      if index == 0:
        return None
      else:
        key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    # The record only applies if `addr` lies inside its [key, key + size) range.
    if addr < key + size:
      return symbol, filename, line_no
    else:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize."""
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if res:
      function_name, file_name, line_no = res
      result = ['%s in %s %s:%d' % (
          addr, function_name, file_name, line_no)]
      # Diagnostics go to the debug log; printing here would interleave a
      # Python list repr with the symbolized report on stdout.
      logging.debug('breakpad result: %s', result)
      return result
    else:
      return None
397
398
class SymbolizationLoop(object):
  """Reads a sanitizer log line by line and symbolizes its stack frames."""

  def __init__(self, plugin_proxy=None, dsym_hint_producer=None):
    """Set up the per-platform line processor.

    Args:
        plugin_proxy: optional AsanSymbolizerPlugInProxy consulted to filter
            binary paths; None means no plugins are in use.
        dsym_hint_producer: optional callable mapping a binary path to an
            iterable of .dSYM hints (only used on Darwin).
    Raises:
        Exception: on a POSIX system this script does not know.
    """
    self.plugin_proxy = plugin_proxy
    if sys.platform == 'win32':
      # ASan on Windows uses dbghelp.dll to symbolize in-process, which works
      # even in sandboxed processes.  Nothing needs to be done here.
      self.process_line = self.process_line_echo
    else:
      # Used by clients who may want to supply a different binary name.
      # E.g. in Chrome several binaries may share a single .dSYM.
      self.dsym_hint_producer = dsym_hint_producer
      self.system = os.uname()[0]
      if self.system not in ['Linux', 'Darwin', 'FreeBSD', 'NetBSD','SunOS']:
        raise Exception('Unknown system')
      self.llvm_symbolizers = {}
      self.last_llvm_symbolizer = None
      self.dsym_hints = set([])
      self.frame_no = 0
      self.process_line = self.process_line_posix
      # `plugin_proxy` defaults to None, so only query it when one was given.
      self.using_module_map = (
          plugin_proxy is not None and
          plugin_proxy.has_plugin(ModuleMapPlugIn.get_name()))

  def symbolize_address(self, addr, binary, offset, arch):
    """Return the symbolized frame strings for one address.

    Raises:
        Exception: when only llvm-symbolizer is allowed and it gave no result.
    """
    # On non-Darwin (i.e. on platforms without .dSYM debug info) always use
    # a single symbolizer binary.
    # On Darwin, if the dsym hint producer is present:
    #  1. check whether we've seen this binary already; if so,
    #     use |llvm_symbolizers[binary]|, which has already loaded the debug
    #     info for this binary (might not be the case for
    #     |last_llvm_symbolizer|);
    #  2. otherwise check if we've seen all the hints for this binary already;
    #     if so, reuse |last_llvm_symbolizer| which has the full set of hints;
    #  3. otherwise create a new symbolizer and pass all currently known
    #     .dSYM hints to it.
    result = None
    if not force_system_symbolizer:
      if binary not in self.llvm_symbolizers:
        use_new_symbolizer = True
        if self.system == 'Darwin' and self.dsym_hint_producer:
          dsym_hints_for_binary = set(self.dsym_hint_producer(binary))
          use_new_symbolizer = bool(dsym_hints_for_binary - self.dsym_hints)
          self.dsym_hints |= dsym_hints_for_binary
        if self.last_llvm_symbolizer and not use_new_symbolizer:
          self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
        else:
          self.last_llvm_symbolizer = LLVMSymbolizerFactory(
              self.system, arch, self.dsym_hints)
          self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
      # Use the chain of symbolizers:
      # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
      # (fall back to next symbolizer if the previous one fails).
      if binary not in symbolizers:
        symbolizers[binary] = ChainSymbolizer(
            [BreakpadSymbolizerFactory(binary), self.llvm_symbolizers[binary]])
      result = symbolizers[binary].symbolize(addr, binary, offset)
    else:
      # Create the (initially empty) chain only once per binary and reuse it
      # afterwards, so that one addr2line/atos child serves all frames of the
      # binary instead of a new child being spawned for every frame.
      if binary not in symbolizers:
        symbolizers[binary] = ChainSymbolizer([])
      result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      if not allow_system_symbolizer:
        raise Exception('Failed to launch or use llvm-symbolizer.')
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary, arch))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def get_symbolized_lines(self, symbolized_lines, inc_frame_counter=True):
    """Format symbolized frames, renumbering them with the frame counter.

    With no frames the current input line is returned unchanged (and the
    counter is advanced only if `inc_frame_counter` is true).
    """
    if not symbolized_lines:
      if inc_frame_counter:
        self.frame_no += 1
      return [self.current_line]
    else:
      assert inc_frame_counter
      result = []
      for symbolized_frame in symbolized_lines:
        result.append('    #%s %s' % (str(self.frame_no), symbolized_frame.rstrip()))
        self.frame_no += 1
      return result

  def process_logfile(self):
    """Symbolize every line of the global `logfile` and print the result."""
    self.frame_no = 0
    for line in logfile:
      processed = self.process_line(line)
      print('\n'.join(processed))

  def process_line_echo(self, line):
    """Pass the line through untouched (minus trailing whitespace)."""
    return [line.rstrip()]

  def process_line_posix(self, line):
    """Symbolize one log line; non-frame lines are returned unchanged."""
    self.current_line = line.rstrip()
    # Unsymbolicated:
    # #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    # Partially symbolicated:
    # #0 0x7f6e35cf2e45 in foo (foo.so+0x11fe45)
    # NOTE: We have to be very liberal with symbol
    # names in the regex because it could be an
    # Objective-C or C++ demangled name.
    stack_trace_line_format = (
        r'^( *#([0-9]+) *)(0x[0-9a-f]+) *(?:in *.+)? *\((.*)\+(0x[0-9a-f]+)\)')
    match = re.match(stack_trace_line_format, line)
    if not match:
      logging.debug('Line "{}" does not match regex'.format(line))
      # Not a frame line so don't increment the frame counter.
      return self.get_symbolized_lines(None, inc_frame_counter=False)
    logging.debug(line)
    _, frameno_str, addr, binary, offset = match.groups()

    if not self.using_module_map and not os.path.isabs(binary):
      # Do not try to symbolicate if the binary is just the module file name
      # and a module map is unavailable.
      # FIXME(dliew): This is currently necessary for reports on Darwin that are
      # partially symbolicated by `atos`.
      return self.get_symbolized_lines(None)
    arch = ""
    # Arch can be embedded in the filename, e.g.: "libabc.dylib:x86_64h"
    colon_pos = binary.rfind(":")
    if colon_pos != -1:
      maybe_arch = binary[colon_pos+1:]
      if is_valid_arch(maybe_arch):
        arch = maybe_arch
        binary = binary[0:colon_pos]
    if arch == "":
      arch = guess_arch(addr)
    if frameno_str == '0':
      # Assume that frame #0 is the first frame of new stack trace.
      self.frame_no = 0
    original_binary = binary
    # Without a plugin proxy the binary path is used as reported.
    if self.plugin_proxy is not None:
      binary = self.plugin_proxy.filter_binary_path(binary)
    if binary is None:
      # The binary filter has told us this binary can't be symbolized.
      logging.debug('Skipping symbolication of binary "%s"', original_binary)
      return self.get_symbolized_lines(None)
    symbolized_line = self.symbolize_address(addr, binary, offset, arch)
    if not symbolized_line:
      if original_binary != binary:
        symbolized_line = self.symbolize_address(addr, original_binary, offset, arch)
    return self.get_symbolized_lines(symbolized_line)
537
class AsanSymbolizerPlugInProxy(object):
  """
    Owns the registered plugins and forwards calls to them:
    - Manages the lifetime of plugins (must be used in a `with` statement so
      that every plugin's `destroy()` hook runs on exit).
    - Provides the interface for calling into plugins from within this script.
  """
  def __init__(self):
    self._plugins = []
    self._plugin_names = set()

  def _load_plugin_from_file_impl_py_gt_2(self, file_path, globals_space):
    """Python 3 loader: run the source of `file_path` in `globals_space`."""
    with open(file_path, 'r') as plugin_file:
      source = plugin_file.read()
    exec(source, globals_space, None)

  def load_plugin_from_file(self, file_path):
    """Execute the plugin file, exposing `register_plugin` to its code."""
    logging.info('Loading plugins from "{}"'.format(file_path))

    # Callback the plugin file uses to hand its plugin objects to the proxy.
    def register_plugin(plugin):
      logging.info('Registering plugin %s', plugin.get_name())
      self.add_plugin(plugin)

    # The plugin sees a copy of this module's globals plus the callback.
    globals_space = dict(globals())
    globals_space['register_plugin'] = register_plugin
    running_python2 = sys.version_info.major < 3
    if running_python2:
      execfile(file_path, globals_space, None)
    else:
      # Indirection here is to avoid a bug in older Python 2 versions:
      # `SyntaxError: unqualified exec is not allowed in function ...`
      self._load_plugin_from_file_impl_py_gt_2(file_path, globals_space)

  def add_plugin(self, plugin):
    """Start managing `plugin` and give it a reference to this proxy."""
    assert isinstance(plugin, AsanSymbolizerPlugIn)
    self._plugins.append(plugin)
    self._plugin_names.add(plugin.get_name())
    plugin._receive_proxy(self)

  def remove_plugin(self, plugin):
    """Stop managing `plugin` and call its `destroy()` hook."""
    assert isinstance(plugin, AsanSymbolizerPlugIn)
    self._plugins.remove(plugin)
    self._plugin_names.remove(plugin.get_name())
    logging.debug('Removing plugin %s', plugin.get_name())
    plugin.destroy()

  def has_plugin(self, name):
    """
      Returns true iff a plugin with this name is currently
      being managed by AsanSymbolizerPlugInProxy.
    """
    return name in self._plugin_names

  def register_cmdline_args(self, parser):
    """Let every plugin add its own options to `parser`."""
    for plugin in list(self._plugins):
      plugin.register_cmdline_args(parser)

  def process_cmdline_args(self, pargs):
    """Hand the parsed arguments to each plugin; drop those that opt out."""
    # Walk a snapshot because plugins may be removed along the way.
    for plugin in list(self._plugins):
      keep = plugin.process_cmdline_args(pargs)
      assert isinstance(keep, bool)
      if keep:
        continue
      self.remove_plugin(plugin)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_val, exc_tb):
    for plugin in self._plugins:
      plugin.destroy()
    # Returning False lets any exception raised in the `with` body propagate.
    return False

  def _filter_single_value(self, function_name, input_value):
    """
      Helper for filter style plugin functions: thread the value through the
      named hook of every plugin, stopping as soon as one returns `None`.
    """
    current = input_value
    for plugin in self._plugins:
      hook = getattr(plugin, function_name)
      current = hook(current)
      if current is None:
        return None
    return current

  def filter_binary_path(self, binary_path):
    """
      Ask the plugins to turn `binary_path` into a path that is
      suitable for symbolication.

      Returns `None` if symbolication should not be attempted for this
      binary.
    """
    return self._filter_single_value('filter_binary_path', binary_path)

  def filter_module_desc(self, module_desc):
    """
      Ask the plugins for the module description that should be
      used for symbolication.

      Returns `None` if symbolication should not be attempted for this module.
    """
    assert isinstance(module_desc, ModuleDesc)
    return self._filter_single_value('filter_module_desc', module_desc)
641
class AsanSymbolizerPlugIn(object):
  """
    Base class for plugins: the interface through which the
    `asan_symbolize.py` code talks to them. Every hook has a neutral
    default so subclasses only override what they need.
  """
  @classmethod
  def get_name(cls):
    """
      Returns the name of the plugin (the class name).
    """
    return cls.__name__

  def _receive_proxy(self, proxy):
    # Remember the proxy that manages this plugin.
    assert isinstance(proxy, AsanSymbolizerPlugInProxy)
    self.proxy = proxy

  def register_cmdline_args(self, parser):
    """
      Hook for adding command line arguments that are later
      consumed in `process_cmdline_args()`.

      `parser` - Instance of `argparse.ArgumentParser`.
    """
    return None

  def process_cmdline_args(self, pargs):
    """
      Hook for handling parsed arguments. Implementations
      should not modify `pargs`.

      `pargs` - Instance of `argparse.Namespace` containing
      parsed command line arguments.

      Return `True` if the plug-in should be used, otherwise
      return `False`.
    """
    return True

  def destroy(self):
    """
      Hook called when a plugin is about to be destroyed.
      Implementations should free any allocated resources here.
    """
    return None

  # Symbolization hooks
  def filter_binary_path(self, binary_path):
    """
      Given a binary path return a binary path suitable for symbolication.
      The default returns the path unchanged.

      Implementations should return `None` if symbolication of this binary
      should be skipped.
    """
    return binary_path

  def filter_module_desc(self, module_desc):
    """
      Given a ModuleDesc object (`module_desc`) return
      a ModuleDesc suitable for symbolication. The default returns the
      object unchanged.

      Implementations should return `None` if symbolication of this binary
      should be skipped.
    """
    return module_desc
706
class ModuleDesc(object):
  """Description of one loaded module taken from an ASan report."""

  def __init__(self, name, arch, start_addr, end_addr, module_path, uuid):
    self.name = name
    self.arch = arch
    self.start_addr = start_addr
    self.end_addr = end_addr
    self.uuid = uuid
    # Module path from an ASan report.
    self.module_path = module_path
    # Module for performing symbolization, by default same as above.
    self.module_path_for_symbolization = module_path
    assert self.is_valid()

  def __str__(self):
    assert self.is_valid()
    # Show both paths when symbolization uses a different file than the one
    # named in the report.
    if self.module_path == self.module_path_for_symbolization:
      shown_path = self.module_path
    else:
      shown_path = '{} ({})'.format(self.module_path_for_symbolization, self.module_path)
    template = "{name} {arch} {start_addr:#016x}-{end_addr:#016x} {module_path} {uuid}"
    return template.format(
      name=self.name,
      arch=self.arch,
      start_addr=self.start_addr,
      end_addr=self.end_addr,
      module_path=shown_path,
      uuid=self.uuid
    )

  def is_valid(self):
    """Check field types, address ordering and that both paths are absolute."""
    # `and` short-circuits, so each value check only runs once the type check
    # before it has passed.
    return (
        isinstance(self.name, str) and
        isinstance(self.arch, str) and
        isinstance(self.start_addr, int) and
        self.start_addr >= 0 and
        isinstance(self.end_addr, int) and
        self.end_addr > self.start_addr and
        isinstance(self.module_path, str) and
        os.path.isabs(self.module_path) and
        isinstance(self.module_path_for_symbolization, str) and
        os.path.isabs(self.module_path_for_symbolization) and
        isinstance(self.uuid, str)
    )
755
class GetUUIDFromBinaryException(Exception):
  """Raised by get_uuid_from_binary() when no UUID can be obtained."""

  def __init__(self, msg):
    # A message is mandatory; it becomes the exception's only argument.
    super(GetUUIDFromBinaryException, self).__init__(msg)
759
# Maps (path_to_binary, arch) to the UUID already read from that binary.
_get_uuid_from_binary_cache = dict()

def get_uuid_from_binary(path_to_binary, arch=None):
  """Return the LC_UUID of a binary as reported by `otool -l`.

  Args:
      path_to_binary: path of the binary to inspect.
      arch: optional architecture passed to otool via `-arch`.
  Returns:
      The UUID string. Successful lookups are cached per (path, arch).
  Raises:
      GetUUIDFromBinaryException: the binary does not exist, otool could not
          be run or failed, or its output contains no usable UUID.
  """
  cache_key = (path_to_binary, arch)
  cached_value = _get_uuid_from_binary_cache.get(cache_key)
  if cached_value:
    return cached_value
  if not os.path.exists(path_to_binary):
    raise GetUUIDFromBinaryException('Binary "{}" does not exist'.format(path_to_binary))
  cmd = [ '/usr/bin/otool', '-l']
  if arch:
    cmd.extend(['-arch', arch])
  cmd.append(path_to_binary)
  try:
    output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
  except (OSError, subprocess.CalledProcessError) as e:
    # Report tool failures through the one exception type this function
    # documents, so callers that handle it do not crash on a missing or
    # failing otool.
    raise GetUUIDFromBinaryException(
        'Failed to run otool on "{}": {}'.format(path_to_binary, e))
  # check_output() returns bytes on Python 3 and str on Python 2.
  output_str = output if isinstance(output, str) else output.decode()
  # Look for this output:
  # cmd LC_UUID
  # cmdsize 24
  # uuid 4CA778FE-5BF9-3C45-AE59-7DF01B2BE83F
  lines = output_str.split('\n')
  uuid = None
  for index, line in enumerate(lines):
    if not line.strip().startswith('cmd LC_UUID'):
      continue
    # The uuid is expected two lines below the load command header; guard
    # against output that ends before that line.
    uuid_line = lines[index+2].strip() if index + 2 < len(lines) else ''
    split_uuid_line = uuid_line.split()
    if not uuid_line.startswith('uuid') or len(split_uuid_line) < 2:
      raise GetUUIDFromBinaryException('Malformed output: "{}"'.format(uuid_line))
    uuid = split_uuid_line[1]
    break
  if uuid is None:
    logging.error('Failed to retrieve UUID from binary {}'.format(path_to_binary))
    logging.error('otool output was:\n{}'.format(output_str))
    raise GetUUIDFromBinaryException('Failed to retrieve UUID from binary "{}"'.format(path_to_binary))
  # Update cache
  _get_uuid_from_binary_cache[cache_key] = uuid
  return uuid
804
class ModuleMap(object):
  """
    Maps module names to the module descriptions (`ModuleDesc`) loaded
    from a textual module map.

    Instances are normally created by `parse_from_file()`.
  """
  def __init__(self):
    # Keyed by the `name` attribute of each added description.
    self._module_name_to_description_map = dict()

  def add_module(self, desc):
    """
      Register `desc` (a `ModuleDesc`) under `desc.name`.

      Asserts that no module with the same name was added before.
    """
    assert isinstance(desc, ModuleDesc)
    assert desc.name not in self._module_name_to_description_map
    self._module_name_to_description_map[desc.name] = desc

  def find_module_by_name(self, name):
    """Return the description registered under `name`, or `None`."""
    return self._module_name_to_description_map.get(name, None)

  def __str__(self):
    """
      Return a header with the module count followed by one line per
      module, ordered by ascending start address.
    """
    ordered_descs = sorted(
      self._module_name_to_description_map.values(),
      key=lambda v: v.start_addr
    )
    header = '{} modules:\n'.format(self.num_modules)
    # Join once rather than growing the string one module at a time.
    return header + ''.join(str(module_desc) + '\n' for module_desc in ordered_descs)

  @property
  def num_modules(self):
    """Number of modules currently registered."""
    return len(self._module_name_to_description_map)

  @property
  def modules(self):
    """A new set containing every registered module description."""
    return set(self._module_name_to_description_map.values())

  def get_module_path_for_symbolication(self, module_name, proxy, validate_uuid):
    """
      Return the path of the binary to use when symbolizing `module_name`.

      `proxy` must provide `filter_module_desc()`, which may replace the
      module description or return `None` to veto symbolication.
      When `validate_uuid` is true the UUID read from the binary via
      `get_uuid_from_binary()` must equal the UUID in the description.

      Returns `None` when the module is unknown, when the proxy vetoes it,
      when the UUIDs differ, or when the UUID cannot be read.
    """
    module_desc = self.find_module_by_name(module_name)
    if module_desc is None:
      return None
    # Allow a plug-in to change the module description to make it
    # suitable for symbolication or avoid symbolication altogether.
    module_desc = proxy.filter_module_desc(module_desc)
    if module_desc is None:
      return None
    if validate_uuid:
      logging.debug('Validating UUID of {}'.format(module_desc.module_path_for_symbolization))
      try:
        uuid = get_uuid_from_binary(module_desc.module_path_for_symbolization, arch = module_desc.arch)
        if uuid != module_desc.uuid:
          logging.warning("Detected UUID mismatch {} != {}".format(uuid, module_desc.uuid))
          # UUIDs don't match. Tell client to not symbolize this.
          return None
      except GetUUIDFromBinaryException as e:
        # The operation that failed is reading the UUID out of the binary.
        logging.error('Failed to get UUID from binary: %s', str(e))
        return None
    else:
      logging.warning('Skipping validation of UUID of {}'.format(module_desc.module_path_for_symbolization))
    return module_desc.module_path_for_symbolization

  @staticmethod
  def parse_from_file(module_map_path):
    """
      Parse the module map contained in the text file `module_map_path`.

      Everything before a line starting with `Process module map:` is
      ignored. Every line after it, up to a line starting with
      `End of module map`, must describe one module.

      Returns the resulting `ModuleMap`, or `None` if the file has no
      `Process module map:` line.

      Raises `Exception` if the file does not exist, if a line inside the
      module map cannot be parsed, or if the file ends before the
      `End of module map` line.
    """
    if not os.path.exists(module_map_path):
      raise Exception('module map "{}" does not exist'.format(module_map_path))

    # E.g.
    # 0x2db4000-0x102ddc000 /path/to (arm64) <0D6BBDE0-FF90-3680-899D-8E6F9528E04C>
    def hex_regex(name):
      return r'0x(?P<' + name + r'>[0-9a-f]+)'
    module_path_regex = r'(?P<path>.+)'
    arch_regex = r'\((?P<arch>.+)\)'
    uuid_regex = r'<(?P<uuid>[0-9A-Z-]+)>'
    line_regex = r'^{}-{}\s+{}\s+{}\s+{}'.format(
      hex_regex('start_addr'),
      hex_regex('end_addr'),
      module_path_regex,
      arch_regex,
      uuid_regex
    )
    matcher = re.compile(line_regex)
    parse_error = 'Failed to parse line {} "{}"'

    with open(module_map_path, 'r') as f:
      mm = None
      line_num = 0
      for line_num, line in enumerate(f, start=1):
        if mm is None:
          # Still looking for the start marker.
          if line.startswith('Process module map:'):
            mm = ModuleMap()
          continue
        if line.startswith('End of module map'):
          break
        m_obj = matcher.match(line)
        if not m_obj:
          raise Exception(parse_error.format(line_num, line))
        module_path = m_obj.group('path')
        mm.add_module(ModuleDesc(
          name=os.path.basename(module_path),
          arch=m_obj.group('arch'),
          start_addr=int(m_obj.group('start_addr'), base=16),
          end_addr=int(m_obj.group('end_addr'), base=16),
          module_path=module_path,
          uuid=m_obj.group('uuid')
        ))
      else:
        # End of file reached without seeing the end marker. A map that
        # was started but never terminated is reported as a parse failure
        # of the (empty) line following the last one.
        if mm is not None:
          raise Exception(parse_error.format(line_num + 1, ''))
      if mm is not None:
        logging.debug('Loaded Module map from "{}":\n{}'.format(
          f.name,
          str(mm))
        )
      return mm
909
class SysRootFilterPlugIn(AsanSymbolizerPlugIn):
  """
    Built-in plug-in that prepends a sys root directory (the `-s`
    command line option) to every binary path used for symbolication.
  """
  def __init__(self):
    # Empty until `-s` is seen, so paths pass through unchanged.
    self.sysroot_path = ""

  def register_cmdline_args(self, parser):
    """Add the `-s SYSROOT` option to `parser`."""
    parser.add_argument(
      '-s',
      metavar='SYSROOT',
      dest='sys_root',
      help='set path to sysroot for sanitized binaries')

  def process_cmdline_args(self, pargs):
    """
      Record the sys root from the parsed arguments `pargs`.

      Returns True when `-s` was given, False otherwise.
    """
    sys_root = pargs.sys_root
    if sys_root is not None:
      self.sysroot_path = sys_root
      return True
    # Not being used so remove ourselves.
    return False

  def filter_binary_path(self, path):
    """Return `path` prefixed with the configured sys root."""
    prefix = self.sysroot_path
    return prefix + path
931
class ModuleMapPlugIn(AsanSymbolizerPlugIn):
  """
    Built-in plug-in that resolves binary paths through a module map
    file (the `--module-map` command line option), optionally checking
    that each binary's UUID matches the one recorded in the map.
  """
  def __init__(self):
    # Populated by `process_cmdline_args()` when `--module-map` is given.
    self._module_map = None
    # Cleared by `--skip-uuid-validation`.
    self._uuid_validation = True

  def register_cmdline_args(self, parser):
    """Add the `--module-map` and `--skip-uuid-validation` options to `parser`."""
    parser.add_argument('--module-map',
                        help='Path to text file containing module map '
                        'output. See print_module_map ASan option.')
    parser.add_argument('--skip-uuid-validation',
                        default=False,
                        action='store_true',
                        help='Skips validating UUID of modules using otool.')

  def process_cmdline_args(self, pargs):
    """
      Load the module map named by the parsed arguments `pargs`.

      Returns False when `--module-map` was not given, True once the map
      has been loaded.

      Raises `Exception` if the file does not contain a module map;
      errors raised by `ModuleMap.parse_from_file()` propagate.
    """
    if not pargs.module_map:
      return False
    # Use the arguments handed to this method rather than a module-level
    # `args` global, which only exists when the file is run as a script.
    self._module_map = ModuleMap.parse_from_file(pargs.module_map)
    if self._module_map is None:
      msg = 'Failed to find module map'
      logging.error(msg)
      raise Exception(msg)
    self._uuid_validation = not pargs.skip_uuid_validation
    return True

  def filter_binary_path(self, binary_path):
    """
      Map `binary_path` to the path that should be used for symbolication.

      An absolute path is reduced to its base name before the lookup; any
      other value is used as the module name as-is. Returns whatever
      `ModuleMap.get_module_path_for_symbolication()` returns, which may
      be `None`.
    """
    if os.path.isabs(binary_path):
      # This is a binary path so transform into
      # a module name
      module_name = os.path.basename(binary_path)
    else:
      module_name = binary_path
    # NOTE(review): `self.proxy` is not assigned in this class; presumably
    # it is provided by the plug-in machinery - confirm.
    return self._module_map.get_module_path_for_symbolication(
      module_name,
      self.proxy,
      self._uuid_validation
    )
968
def add_logging_args(parser):
  """
    Register the logging options `--log-dest` and `--log-level` on `parser`.
  """
  level_names = ['debug', 'info', 'warning', 'error', 'critical']
  parser.add_argument(
    '--log-dest',
    help='Destination path for script logging (default stderr).',
    default=None,
  )
  parser.add_argument(
    '--log-level',
    help='Log level for script (default: %(default)s).',
    default='info',
    choices=level_names,
  )
979
def setup_logging():
  """
    Configure the `logging` module from the command line.

    Only the logging options are parsed here (from `sys.argv`); this is
    done up front because logging should be configured before the main
    argument parsing happens.

    Returns the list of command line arguments that were not consumed.
  """
  logging_parser = argparse.ArgumentParser(add_help=False)
  add_logging_args(logging_parser)
  pargs, unparsed_args = logging_parser.parse_known_args()

  log_level = getattr(logging, pargs.log_level.upper())
  # Debug output additionally records where each message came from.
  verbose_format = '%(levelname)s: [%(funcName)s() %(filename)s:%(lineno)d] %(message)s'
  plain_format = '%(levelname)s: %(message)s'
  basic_config = dict(
    level=log_level,
    format=verbose_format if log_level == logging.DEBUG else plain_format,
  )
  log_dest = pargs.log_dest
  if log_dest:
    basic_config['filename'] = log_dest
  logging.basicConfig(**basic_config)

  destination = log_dest if log_dest is not None else 'stderr'
  logging.debug('Logging level set to "{}" and directing output to "{}"'.format(
    pargs.log_level,
    destination)
  )
  return unparsed_args
1006
def add_load_plugin_args(parser):
  """
    Register the `-p`/`--plugins` option on `parser`. It takes one or
    more values and defaults to an empty list.
  """
  parser.add_argument(
    '-p', '--plugins',
    nargs='+',
    default=[],
    help='Load plug-in',
  )
1010
def setup_plugins(plugin_proxy, args):
  """
    Load the plug-ins requested with `-p`/`--plugins` into `plugin_proxy`
    and then add the built-in plug-ins.

    NOTE: `args` is not consulted; the plug-in options are parsed from
    `sys.argv` because `parse_known_args()` is called without arguments.

    Returns the list of command line arguments that were not consumed.
  """
  plugin_parser = argparse.ArgumentParser(add_help=False)
  add_load_plugin_args(plugin_parser)
  pargs, unparsed_args = plugin_parser.parse_known_args()
  # Externally supplied plug-ins are loaded first.
  for plugin_path in pargs.plugins:
    plugin_proxy.load_plugin_from_file(plugin_path)
  # Add built-in plugins.
  for builtin_plugin_class in (ModuleMapPlugIn, SysRootFilterPlugIn):
    plugin_proxy.add_plugin(builtin_plugin_class())
  return unparsed_args
1021
if __name__ == '__main__':
  # Script entry point: configure logging, load plug-ins, parse the
  # remaining command line, publish the options through the module-level
  # settings and finally symbolize the log file.
  remaining_args = setup_logging()
  with AsanSymbolizerPlugInProxy() as plugin_proxy:
    remaining_args = setup_plugins(plugin_proxy, remaining_args)
    parser = argparse.ArgumentParser(
        description='ASan symbolization script',
        epilog=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'path_to_cut', nargs='*',
        help='pattern to be cut from the result file path ')
    parser.add_argument(
        '-d', '--demangle', action='store_true',
        help='demangle function names')
    parser.add_argument(
        '-c', metavar='CROSS_COMPILE',
        help='set prefix for binutils')
    parser.add_argument(
        '-l', '--logfile', type=argparse.FileType('r'), default=sys.stdin,
        help='set log file name to parse, default is stdin')
    parser.add_argument(
        '--force-system-symbolizer', action='store_true',
        help="don't use llvm-symbolizer")
    # Add logging arguments so that `--help` shows them.
    add_logging_args(parser)
    # Add load plugin arguments so that `--help` shows them.
    add_load_plugin_args(parser)
    # Give the plug-ins a chance to add their own options before parsing.
    plugin_proxy.register_cmdline_args(parser)
    args = parser.parse_args(remaining_args)
    plugin_proxy.process_cmdline_args(args)

    # Only override a module-level setting when its option was supplied.
    if args.path_to_cut:
      fix_filename_patterns = args.path_to_cut
    if args.demangle:
      demangle = True
    if args.c:
      binutils_prefix = args.c
    logfile = args.logfile if args.logfile else sys.stdin
    if args.force_system_symbolizer:
      force_system_symbolizer = True
    # Forcing the system symbolizer requires that it is allowed.
    assert (not force_system_symbolizer) or allow_system_symbolizer

    loop = SymbolizationLoop(plugin_proxy)
    loop.process_logfile()
1064