xref: /llvm-project/compiler-rt/lib/asan/scripts/asan_symbolize.py (revision 61353cc1f65f02477eedeebcb08e9193cbd53305)
1#!/usr/bin/env python
2# ===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8# ===------------------------------------------------------------------------===#
9"""
10Example of use:
11  asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" -s "$HOME/SymbolFiles" < asan.log
12
13PLUGINS
14
15This script provides a way for external plug-ins to hook into the behaviour of
16various parts of this script (see `--plugins`). This is useful for situations
17where it is necessary to handle site-specific quirks (e.g. binaries with debug
18symbols only accessible via a remote service) without having to modify the
19script itself.
20
21"""
22import argparse
23import bisect
24import errno
25import getopt
26import logging
27import os
28import re
29import shutil
30import subprocess
31import sys
32
# Per-binary symbolizer chains, keyed by binary path (filled in lazily by
# SymbolizationLoop.symbolize_address()).
symbolizers = {}
# When true, the llvm-symbolizer/addr2line child processes are asked to
# demangle symbol names.
demangle = False
# Optional prefix prepended to the "addr2line" tool name.
binutils_prefix = None
# Optional list of path patterns that fix_filename() cuts from file names.
fix_filename_patterns = None
# Stream the report is read from, line by line.
logfile = sys.stdin
# If false, a failure of the Breakpad/LLVM symbolizers raises instead of
# falling back to the system symbolizer (addr2line/atos).
allow_system_symbolizer = True
# If true, the Breakpad/LLVM symbolizers are skipped and only the system
# symbolizer is used.
force_system_symbolizer = False
40
# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
    """Normalize a source location string reported by a symbolizer.

    Three rewrites are applied, in order:
      1. Each entry of the module-level `fix_filename_patterns` (if any) is
         treated as a regular expression; everything up to and including its
         match is cut from `file_name`.
      2. Locations inside the ASan runtime (`asan_*.cc` / `asan_*.cpp`) are
         collapsed to "_asan_rtl_".
      3. "crtstuff.c:0" locations are replaced by "???:0".

    Args:
        file_name: location string, typically "path/to/file.c:line".
    Returns:
        The rewritten location string.
    """
    if fix_filename_patterns:
        for path_to_cut in fix_filename_patterns:
            file_name = re.sub(".*" + path_to_cut, "", file_name)
    # The dot before the extension is escaped so that only real ".cc"/".cpp"
    # (and ".c") files match, not arbitrary characters in that position.
    file_name = re.sub(r".*asan_[a-z_]*\.(cc|cpp):[0-9]*", "_asan_rtl_", file_name)
    file_name = re.sub(r".*crtstuff\.c:0", "???:0", file_name)
    return file_name
49
50
def is_valid_arch(s):
    """Return True if `s` is one of the architecture names this script knows."""
    known_arches = (
        "i386",
        "x86_64",
        "x86_64h",
        "arm",
        "armv6",
        "armv7",
        "armv7s",
        "armv7k",
        "arm64",
        "powerpc64",
        "powerpc64le",
        "s390x",
        "s390",
        "riscv64",
        "loongarch64",
    )
    return s in known_arches
69
70
def guess_arch(addr):
    """Guess the architecture from the textual width of an address.

    Args:
        addr: address as a string, e.g. "0x7f6e35cf2e45".
    Returns:
        "x86_64" if the string is longer than a 32-bit hex address,
        otherwise "i386".
    """
    # A 32-bit address is at most '0x' plus 8 hex digits, i.e. 10 characters.
    widest_32bit_address = 10
    return "x86_64" if len(addr) > widest_32bit_address else "i386"
77
78
class Symbolizer(object):
    """Base class of all symbolizers; it symbolizes nothing itself."""

    def __init__(self):
        """Nothing to set up in the base class."""

    def symbolize(self, addr, binary, offset):
        """Describe the code location of one instruction.

        Subclasses override this method; the base implementation always
        reports failure.

        Args:
            addr: virtual address of the instruction.
            binary: path to the executable/shared object that contains it.
            offset: offset of the instruction inside @binary.
        Returns:
            A list of strings, one per (inlined) frame, giving function name,
            file name, line and column numbers; or None if the address could
            not be symbolized.
        """
        return None
97
98
class LLVMSymbolizer(Symbolizer):
    """Symbolizer that talks to a long-lived `llvm-symbolizer` child process.

    Args:
        symbolizer_path: name or path of the llvm-symbolizer executable.
        default_arch: value passed to the tool as `--default-arch`.
        system: OS name; `--dsym-hint` flags are only passed for "Darwin".
        dsym_hints: optional iterable of .dSYM hints (default: no hints).
    """

    def __init__(self, symbolizer_path, default_arch, system, dsym_hints=None):
        super(LLVMSymbolizer, self).__init__()
        self.symbolizer_path = symbolizer_path
        self.default_arch = default_arch
        self.system = system
        # `None` replaces the former mutable `[]` default argument.
        self.dsym_hints = [] if dsym_hints is None else dsym_hints
        self.pipe = self.open_llvm_symbolizer()

    def open_llvm_symbolizer(self):
        """Launch llvm-symbolizer.

        Returns:
            The `subprocess.Popen` object, or None if the process could not
            be started (OSError).
        """
        cmd = [
            self.symbolizer_path,
            ("--demangle" if demangle else "--no-demangle"),
            "--functions=linkage",
            "--inlines",
            "--default-arch=%s" % self.default_arch,
        ]
        if self.system == "Darwin":
            for hint in self.dsym_hints:
                cmd.append("--dsym-hint=%s" % hint)
        logging.debug(" ".join(cmd))
        try:
            result = subprocess.Popen(
                cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                bufsize=0,
                universal_newlines=True,
            )
        except OSError as e:
            # Not fatal: symbolize() reports failure and the caller may fall
            # back to another symbolizer.
            logging.debug("failed to launch llvm-symbolizer", exc_info=e)
            result = None
        return result

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize.

        Returns:
            A list with one "<addr> in <function> <file>" string per
            non-trivial frame, or None if the tool is unavailable, an error
            occurred, or no frame could be resolved.
        """
        if not self.pipe:
            return None
        result = []
        try:
            symbolizer_input = '"%s" %s' % (binary, offset)
            logging.debug(symbolizer_input)
            self.pipe.stdin.write("%s\n" % symbolizer_input)
            # The reply is a sequence of (function, file) line pairs that is
            # terminated by an empty line.
            while True:
                function_name = self.pipe.stdout.readline().rstrip()
                if not function_name:
                    break
                file_name = self.pipe.stdout.readline().rstrip()
                file_name = fix_filename(file_name)
                if not function_name.startswith("??") or not file_name.startswith("??"):
                    # Append only non-trivial frames.
                    result.append("%s in %s %s" % (addr, function_name, file_name))
        except Exception as e:
            # Best effort: treat any failure as "could not symbolize", but
            # leave a trace for debugging instead of swallowing it silently.
            logging.debug(
                "got exception communicating with llvm-symbolizer", exc_info=e
            )
            result = []
        if not result:
            result = None
        return result
155
156
def LLVMSymbolizerFactory(system, default_arch, dsym_hints=None):
    """Create an LLVMSymbolizer, locating the llvm-symbolizer executable.

    The executable is taken from the LLVM_SYMBOLIZER_PATH environment
    variable, then from ASAN_SYMBOLIZER_PATH; if neither is set (or both are
    empty), "llvm-symbolizer" is assumed to be in PATH.

    Args:
        system: OS name forwarded to LLVMSymbolizer.
        default_arch: default architecture forwarded to LLVMSymbolizer.
        dsym_hints: optional iterable of .dSYM hints (default: no hints).
    Returns:
        A new LLVMSymbolizer instance.
    """
    symbolizer_path = (
        os.getenv("LLVM_SYMBOLIZER_PATH")
        or os.getenv("ASAN_SYMBOLIZER_PATH")
        # Assume llvm-symbolizer is in PATH.
        or "llvm-symbolizer"
    )
    # `None` replaces the former mutable `[]` default argument; always hand a
    # real (possibly empty) collection to LLVMSymbolizer.
    if dsym_hints is None:
        dsym_hints = []
    return LLVMSymbolizer(symbolizer_path, default_arch, system, dsym_hints)
165
166
class Addr2LineSymbolizer(Symbolizer):
    """Symbolizes offsets of a single binary through an `addr2line` child."""

    def __init__(self, binary):
        super(Addr2LineSymbolizer, self).__init__()
        self.binary = binary
        self.pipe = self.open_addr2line()
        # Bogus address sent after every real query; the tool's answer to it
        # marks the end of the reply to the real query.
        self.output_terminator = -1

    def open_addr2line(self):
        """Start `addr2line -fi [--demangle] -e <binary>` and return the Popen."""
        tool = "addr2line"
        if binutils_prefix:
            tool = binutils_prefix + tool
        logging.debug("addr2line binary is %s", shutil.which(tool))
        argv = [tool, "-fi"]
        if demangle:
            argv.append("--demangle")
        argv.extend(["-e", self.binary])
        logging.debug(" ".join(argv))
        child = subprocess.Popen(
            argv,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            bufsize=0,
            universal_newlines=True,
        )
        return child

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize.

        Returns None for a binary other than the one this instance was
        created for; otherwise a list of "<addr> in <function> <file>"
        strings, one per frame.
        """
        if binary != self.binary:
            return None
        frames = []
        try:
            self.pipe.stdin.write("%s\n" % offset)
            self.pipe.stdin.write("%s\n" % self.output_terminator)
            seen_first_frame = False
            while True:
                function_name = self.pipe.stdout.readline().rstrip()
                logging.debug("read function_name='%s' from addr2line", function_name)
                # If llvm-symbolizer is installed as addr2line, older versions of
                # llvm-symbolizer will print -1 when presented with -1 and not print
                # a second line. In that case we will block for ever trying to read the
                # file name. This also happens for non-existent files, in which case GNU
                # addr2line exits immediate, but llvm-symbolizer does not (see
                # https://llvm.org/PR42754).
                if function_name == "-1":
                    logging.debug("got function '-1' -> no more input")
                    break
                file_name = self.pipe.stdout.readline().rstrip()
                logging.debug("read file_name='%s' from addr2line", file_name)
                if not seen_first_frame:
                    # The first pair always belongs to the queried offset.
                    seen_first_frame = True
                elif function_name == "??":
                    assert file_name == "??:0", file_name
                    logging.debug("got function '??' -> no more input")
                    break
                elif not function_name:
                    assert not file_name, file_name
                    logging.debug("got empty function name -> no more input")
                    break
                if not (function_name or file_name):
                    logging.debug(
                        "got empty function and file name -> unknown function"
                    )
                    function_name, file_name = "??", "??:0"
                frames.append((function_name, file_name))
        except OSError as err:
            # EPIPE happens if addr2line exits early (which some implementations do
            # if an invalid file is passed).
            if err.errno == errno.EPIPE:
                logging.debug(
                    f"addr2line exited early (broken pipe) returncode={self.pipe.poll()}"
                )
            else:
                logging.debug(
                    "unexpected I/O exception communicating with addr2line",
                    exc_info=err,
                )
            frames.append(("??", "??:0"))
        except Exception as err:
            logging.debug(
                "got unknown exception communicating with addr2line", exc_info=err
            )
            frames.append(("??", "??:0"))
        described = []
        for function, location in frames:
            described.append("%s in %s %s" % (addr, function, fix_filename(location)))
        return described
253
254
class UnbufferedLineConverter(object):
    """
    Wrap a child process that responds to each line of input with one line of
    output.  Uses pty to trick the child into providing unbuffered output.
    """

    def __init__(self, args, close_stderr=False):
        """Fork a child on a new pty and exec `args` in it.

        Args:
            args: command line of the child (`args[0]` is the program).
            close_stderr: if true, the child's stderr is redirected to the
                null device.
        """
        # Local imports so that the script can start on Windows.
        import pty
        import termios

        pid, fd = pty.fork()
        if pid == 0:
            # We're the child. Transfer control to command.
            try:
                if close_stderr:
                    # stderr is written to, so open the null device for writing.
                    dev_null = os.open(os.devnull, os.O_WRONLY)
                    os.dup2(dev_null, 2)
                os.execvp(args[0], args)
            finally:
                # Only reached when exec (or the redirection) failed.  Exit
                # right away so the child never continues running a copy of
                # the parent script.
                os._exit(127)
        else:
            # Disable echoing.
            attr = termios.tcgetattr(fd)
            attr[3] = attr[3] & ~termios.ECHO
            termios.tcsetattr(fd, termios.TCSANOW, attr)
            # Set up a file()-like interface to the child process
            self.r = os.fdopen(fd, "r", 1)
            self.w = os.fdopen(os.dup(fd), "w", 1)

    def convert(self, line):
        """Send `line` to the child and return its next output line."""
        self.w.write(line + "\n")
        return self.readline()

    def readline(self):
        """Read one line from the child, without trailing whitespace."""
        return self.r.readline().rstrip()
288
289
class DarwinSymbolizer(Symbolizer):
    """Symbolizes offsets of a single binary through an `atos` child."""

    def __init__(self, addr, binary, arch):
        super(DarwinSymbolizer, self).__init__()
        self.binary = binary
        self.arch = arch
        self.open_atos()

    def open_atos(self):
        """Start `atos -o <binary> -arch <arch>` with its stderr discarded."""
        logging.debug("atos -o %s -arch %s", self.binary, self.arch)
        self.atos = UnbufferedLineConverter(
            ["atos", "-o", self.binary, "-arch", self.arch], close_stderr=True
        )

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize.

        Returns None for a binary other than the one this instance was
        created for; otherwise a single-element list describing the frame.
        """
        if binary != self.binary:
            return None
        if not os.path.exists(binary):
            # If the binary doesn't exist atos will exit which will lead to IOError
            # exceptions being raised later on so just don't try to symbolize.
            return ["{} ({}:{}+{})".format(addr, binary, self.arch, offset)]
        reply = self.atos.convert("0x%x" % int(offset, 16))
        # Skip informational lines until the actual answer arrives.
        while "got symbolicator for" in reply:
            reply = self.atos.readline()
        # A well-formed atos response looks like this:
        #   foo(type1, type2) (in object.name) (filename.cc:80)
        # NOTE:
        #   * For C functions atos omits parentheses and argument types.
        #   * For C++ functions the function name (i.e., `foo` above) may contain
        #     templates which may contain parentheses.
        parsed = re.match(r"^(.*) \(in (.*)\) \((.*:\d*)\)$", reply)
        logging.debug("atos_line: %s", reply)
        if not parsed:
            # Unrecognized shape: pass the raw atos output through.
            return ["%s in %s" % (addr, reply)]
        function_name = parsed.group(1)
        location = fix_filename(parsed.group(3))
        return ["%s in %s %s" % (addr, function_name, location)]
327
328
# A ChainSymbolizer tries its symbolizers in order: whenever one of them
# fails, the next one in the chain gets a chance.
class ChainSymbolizer(Symbolizer):
    def __init__(self, symbolizer_list):
        super(ChainSymbolizer, self).__init__()
        self.symbolizer_list = symbolizer_list

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize.

        Returns the first non-empty result produced by a symbolizer in the
        chain, or None if every one of them fails.
        """
        for candidate in self.symbolizer_list:
            if not candidate:
                # Empty slot (e.g. a factory that returned None).
                continue
            frames = candidate.symbolize(addr, binary, offset)
            if frames:
                return frames
        return None

    def append_symbolizer(self, symbolizer):
        """Add `symbolizer` as the last fallback of the chain."""
        self.symbolizer_list.append(symbolizer)
347
348
def BreakpadSymbolizerFactory(binary):
    """Return a BreakpadSymbolizer for `binary`, or None if unavailable.

    The symbol file is looked up at `binary` + $BREAKPAD_SUFFIX; None is
    returned if the variable is unset/empty or that file does not exist.
    """
    suffix = os.getenv("BREAKPAD_SUFFIX")
    if not suffix:
        return None
    filename = binary + suffix
    if not os.access(filename, os.F_OK):
        return None
    return BreakpadSymbolizer(filename)
356
357
def SystemSymbolizerFactory(system, addr, binary, arch):
    """Create the platform's native symbolizer for `binary`.

    Returns a DarwinSymbolizer on "Darwin", an Addr2LineSymbolizer on
    "Linux"/"FreeBSD"/"NetBSD"/"SunOS", and None for any other system.
    """
    if system == "Darwin":
        return DarwinSymbolizer(addr, binary, arch)
    if system in ("Linux", "FreeBSD", "NetBSD", "SunOS"):
        return Addr2LineSymbolizer(binary)
    return None
363
364
class BreakpadSymbolizer(Symbolizer):
    """Symbolizer backed by a Breakpad text symbol file.

    The whole file is parsed in the constructor; lookups are then served
    from memory.
    """

    # Record types that carry nothing this symbolizer needs.
    _IGNORED_RECORDS = ("CFI", "STACK", "INFO", "INLINE", "INLINE_ORIGIN")

    def __init__(self, filename):
        """Load and parse the symbol file at `filename`."""
        super(BreakpadSymbolizer, self).__init__()
        self.filename = filename
        # The Python 2 builtin `file()` no longer exists; `open()` in a
        # `with` block also guarantees the handle is closed.
        with open(filename) as sym_file:
            lines = sym_file.readlines()
        self.files = []
        self.symbols = {}
        self.address_list = []
        self.addresses = {}
        # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
        fragments = lines[0].rstrip().split()
        self.arch = fragments[2]
        self.debug_id = fragments[3]
        self.binary = " ".join(fragments[4:])
        self.parse_lines(lines[1:])

    def parse_lines(self, lines):
        """Populate the file, symbol and address tables from record lines."""
        cur_function_addr = ""
        for line in lines:
            fragments = line.split()
            if not fragments:
                # Tolerate blank lines instead of failing on fragments[0].
                continue
            record = fragments[0]
            if record == "FILE":
                assert int(fragments[1]) == len(self.files)
                self.files.append(" ".join(fragments[2:]))
            elif record == "PUBLIC":
                self.symbols[int(fragments[1], 16)] = " ".join(fragments[3:])
            elif record in self._IGNORED_RECORDS:
                pass
            elif record == "FUNC":
                cur_function_addr = int(fragments[1], 16)
                # A PUBLIC name already recorded for this address wins.
                if cur_function_addr not in self.symbols:
                    self.symbols[cur_function_addr] = " ".join(fragments[4:])
            else:
                # Line starting with an address.
                addr = int(fragments[0], 16)
                self.address_list.append(addr)
                # Tuple of symbol address, size, line, file number.
                self.addresses[addr] = (
                    cur_function_addr,
                    int(fragments[1], 16),
                    int(fragments[2]),
                    int(fragments[3]),
                )
        self.address_list.sort()

    def get_sym_file_line(self, addr):
        """Look up the line record that covers `addr`.

        Returns:
            A (symbol, filename, line_no) tuple, or None if no line record
            covers the address.
        """
        if addr in self.addresses:
            key = addr
        else:
            # Closest line record that starts below `addr`.
            index = bisect.bisect_left(self.address_list, addr)
            if index == 0:
                return None
            key = self.address_list[index - 1]
        sym_id, size, line_no, file_no = self.addresses[key]
        # Check the range first so that an address outside the record is
        # reported as "not found" before any table lookup is attempted.
        if addr >= key + size:
            return None
        return self.symbols[sym_id], self.files[file_no], line_no

    def symbolize(self, addr, binary, offset):
        """Overrides Symbolizer.symbolize.

        Returns None for a binary other than the one named in the symbol
        file, or if the offset is not covered by any line record.
        """
        if self.binary != binary:
            return None
        res = self.get_sym_file_line(int(offset, 16))
        if not res:
            return None
        function_name, file_name, line_no = res
        result = ["%s in %s %s:%d" % (addr, function_name, file_name, line_no)]
        # Debug output goes to the log, not to stdout where it would be mixed
        # into the symbolized report.
        logging.debug("breakpad result: %s", result)
        return result
438
439
class SymbolizationLoop(object):
    """Reads a report line by line and replaces stack frames by symbolized ones.

    Args:
        plugin_proxy: optional AsanSymbolizerPlugInProxy consulted to filter
            binary paths; without one, paths are used as they appear.
        dsym_hint_producer: optional callable mapping a binary path to an
            iterable of .dSYM hints (only consulted on Darwin).
    """

    def __init__(self, plugin_proxy=None, dsym_hint_producer=None):
        self.plugin_proxy = plugin_proxy
        if sys.platform == "win32":
            # ASan on Windows uses dbghelp.dll to symbolize in-process, which works
            # even in sandboxed processes.  Nothing needs to be done here.
            self.process_line = self.process_line_echo
        else:
            # Used by clients who may want to supply a different binary name.
            # E.g. in Chrome several binaries may share a single .dSYM.
            self.dsym_hint_producer = dsym_hint_producer
            self.system = os.uname()[0]
            if self.system not in ["Linux", "Darwin", "FreeBSD", "NetBSD", "SunOS"]:
                raise Exception("Unknown system")
            self.llvm_symbolizers = {}
            self.last_llvm_symbolizer = None
            self.dsym_hints = set()
            self.frame_no = 0
            self.process_line = self.process_line_posix
            # `plugin_proxy` defaults to None, in which case no module map
            # can be available.
            self.using_module_map = (
                plugin_proxy is not None
                and plugin_proxy.has_plugin(ModuleMapPlugIn.get_name())
            )

    def symbolize_address(self, addr, binary, offset, arch):
        """Symbolize one address and return the list of frame descriptions.

        Raises:
            Exception: if the Breakpad/LLVM symbolizers fail and the system
                symbolizer is not allowed.
        """
        # On non-Darwin (i.e. on platforms without .dSYM debug info) always use
        # a single symbolizer binary.
        # On Darwin, if the dsym hint producer is present:
        #  1. check whether we've seen this binary already; if so,
        #     use |llvm_symbolizers[binary]|, which has already loaded the debug
        #     info for this binary (might not be the case for
        #     |last_llvm_symbolizer|);
        #  2. otherwise check if we've seen all the hints for this binary already;
        #     if so, reuse |last_llvm_symbolizer| which has the full set of hints;
        #  3. otherwise create a new symbolizer and pass all currently known
        #     .dSYM hints to it.
        if not force_system_symbolizer:
            if binary not in self.llvm_symbolizers:
                use_new_symbolizer = True
                if self.system == "Darwin" and self.dsym_hint_producer:
                    dsym_hints_for_binary = set(self.dsym_hint_producer(binary))
                    use_new_symbolizer = bool(dsym_hints_for_binary - self.dsym_hints)
                    self.dsym_hints |= dsym_hints_for_binary
                if self.last_llvm_symbolizer and not use_new_symbolizer:
                    self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
                else:
                    self.last_llvm_symbolizer = LLVMSymbolizerFactory(
                        self.system, arch, self.dsym_hints
                    )
                    self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
            # Use the chain of symbolizers:
            # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
            # (fall back to next symbolizer if the previous one fails).
            if binary not in symbolizers:
                symbolizers[binary] = ChainSymbolizer(
                    [BreakpadSymbolizerFactory(binary), self.llvm_symbolizers[binary]]
                )
        elif binary not in symbolizers:
            # Forced system symbolizer: start with an empty chain, but keep it
            # across calls so that the system symbolizer process created below
            # is reused instead of being spawned again for every frame.
            symbolizers[binary] = ChainSymbolizer([])
        result = symbolizers[binary].symbolize(addr, binary, offset)
        if result is None:
            if not allow_system_symbolizer:
                raise Exception("Failed to launch or use llvm-symbolizer.")
            # Initialize system symbolizer only if other symbolizers failed.
            symbolizers[binary].append_symbolizer(
                SystemSymbolizerFactory(self.system, addr, binary, arch)
            )
            result = symbolizers[binary].symbolize(addr, binary, offset)
        # The system symbolizer must produce some result.
        assert result
        return result

    def get_symbolized_lines(self, symbolized_lines, inc_frame_counter=True):
        """Turn symbolizer output into the lines to print for the current line.

        Without symbolizer output the current input line is passed through
        unchanged; otherwise one numbered "    #N <frame>" line is produced
        per frame.  The frame counter advances by one per emitted frame (or
        by one for a passed-through line when `inc_frame_counter` is true).
        """
        if not symbolized_lines:
            if inc_frame_counter:
                self.frame_no += 1
            return [self.current_line]
        assert inc_frame_counter
        result = []
        for symbolized_frame in symbolized_lines:
            result.append(
                "    #%s %s" % (str(self.frame_no), symbolized_frame.rstrip())
            )
            self.frame_no += 1
        return result

    def process_logfile(self):
        """Process every line of the module-level `logfile` and print the result."""
        self.frame_no = 0
        for line in logfile:
            processed = self.process_line(line)
            print("\n".join(processed))

    def process_line_echo(self, line):
        """Return the line unchanged (apart from trailing whitespace)."""
        return [line.rstrip()]

    def process_line_posix(self, line):
        """Symbolize `line` if it is a stack frame, otherwise pass it through."""
        self.current_line = line.rstrip()
        # Unsymbolicated:
        # #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
        # Partially symbolicated:
        # #0 0x7f6e35cf2e45 in foo (foo.so+0x11fe45)
        # NOTE: We have to be very liberal with symbol
        # names in the regex because it could be an
        # Objective-C or C++ demangled name.
        stack_trace_line_format = (
            r"^( *#([0-9]+) *)(0x[0-9a-f]+) *(?:in *.+)? *\((.*)\+(0x[0-9a-f]+)\)"
        )
        match = re.match(stack_trace_line_format, line)
        if not match:
            logging.debug('Line "{}" does not match regex'.format(line))
            # Not a frame line so don't increment the frame counter.
            return self.get_symbolized_lines(None, inc_frame_counter=False)
        logging.debug(line)
        _, frameno_str, addr, binary, offset = match.groups()

        if not self.using_module_map and not os.path.isabs(binary):
            # Do not try to symbolicate if the binary is just the module file name
            # and a module map is unavailable.
            # FIXME(dliew): This is currently necessary for reports on Darwin that are
            # partially symbolicated by `atos`.
            return self.get_symbolized_lines(None)
        arch = ""
        # Arch can be embedded in the filename, e.g.: "libabc.dylib:x86_64h"
        colon_pos = binary.rfind(":")
        if colon_pos != -1:
            maybe_arch = binary[colon_pos + 1 :]
            if is_valid_arch(maybe_arch):
                arch = maybe_arch
                binary = binary[0:colon_pos]
        if arch == "":
            arch = guess_arch(addr)
        if frameno_str == "0":
            # Assume that frame #0 is the first frame of new stack trace.
            self.frame_no = 0
        original_binary = binary
        if self.plugin_proxy is not None:
            binary = self.plugin_proxy.filter_binary_path(binary)
        if binary is None:
            # The binary filter has told us this binary can't be symbolized.
            logging.debug('Skipping symbolication of binary "%s"', original_binary)
            return self.get_symbolized_lines(None)
        symbolized_line = self.symbolize_address(addr, binary, offset, arch)
        if not symbolized_line:
            if original_binary != binary:
                symbolized_line = self.symbolize_address(
                    addr, original_binary, offset, arch
                )
        return self.get_symbolized_lines(symbolized_line)
586
587
class AsanSymbolizerPlugInProxy(object):
    """
    Owns the registered plugins and forwards calls to them:
    - Controls the lifetime of plugins (use it in a `with` statement so that
      every plugin is destroyed on exit).
    - Is the entry point this script uses to call into plugins.
    """

    def __init__(self):
        self._plugins = []
        self._plugin_names = set()

    def _load_plugin_from_file_impl_py_gt_2(self, file_path, globals_space):
        with open(file_path, "r") as plugin_file:
            exec(plugin_file.read(), globals_space, None)

    def load_plugin_from_file(self, file_path):
        """Execute the plugin file, giving it a `register_plugin()` function."""
        logging.info('Loading plugins from "{}"'.format(file_path))

        # Function the plugin file calls to register its plugins.
        def register_plugin(plugin):
            logging.info("Registering plugin %s", plugin.get_name())
            self.add_plugin(plugin)

        # The plugin file runs against a copy of this module's globals.
        globals_space = dict(globals(), register_plugin=register_plugin)
        if sys.version_info.major >= 3:
            # Indirection here is to avoid a bug in older Python 2 versions:
            # `SyntaxError: unqualified exec is not allowed in function ...`
            self._load_plugin_from_file_impl_py_gt_2(file_path, globals_space)
        else:
            execfile(file_path, globals_space, None)

    def add_plugin(self, plugin):
        """Start managing `plugin` and hand it a reference to this proxy."""
        assert isinstance(plugin, AsanSymbolizerPlugIn)
        self._plugins.append(plugin)
        self._plugin_names.add(plugin.get_name())
        plugin._receive_proxy(self)

    def remove_plugin(self, plugin):
        """Stop managing `plugin` and destroy it."""
        assert isinstance(plugin, AsanSymbolizerPlugIn)
        self._plugins.remove(plugin)
        self._plugin_names.remove(plugin.get_name())
        logging.debug("Removing plugin %s", plugin.get_name())
        plugin.destroy()

    def has_plugin(self, name):
        """
        Tell whether a plugin called `name` is currently managed by this
        AsanSymbolizerPlugInProxy.
        """
        return name in self._plugin_names

    def register_cmdline_args(self, parser):
        """Let every plugin add its command line arguments to `parser`."""
        for plugin in list(self._plugins):
            plugin.register_cmdline_args(parser)

    def process_cmdline_args(self, pargs):
        """Pass parsed arguments to every plugin; drop those that opt out."""
        # Walk over a snapshot because plugins may be removed on the way.
        for plugin in list(self._plugins):
            keep = plugin.process_cmdline_args(pargs)
            assert isinstance(keep, bool)
            if keep:
                continue
            self.remove_plugin(plugin)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        for plugin in self._plugins:
            plugin.destroy()
        # Returning False lets a raised exception propagate.
        return False

    def _filter_single_value(self, function_name, input_value):
        """
        Shared implementation of the filter-style hooks: thread the value
        through the `function_name` hook of each plugin in turn, stopping
        with `None` as soon as one plugin returns `None`.
        """
        current = input_value
        for plugin in self._plugins:
            hook = getattr(plugin, function_name)
            current = hook(current)
            if current is None:
                return None
        return current

    def filter_binary_path(self, binary_path):
        """
        Ask the plugins to turn `binary_path` into a path that is suitable
        for symbolication.

        Returns `None` if symbolication should not be attempted for this
        binary.
        """
        return self._filter_single_value("filter_binary_path", binary_path)

    def filter_module_desc(self, module_desc):
        """
        Ask the plugins for the module description that is suitable for
        symbolication.

        Returns `None` if symbolication should not be attempted for this module.
        """
        assert isinstance(module_desc, ModuleDesc)
        return self._filter_single_value("filter_module_desc", module_desc)
693
694
class AsanSymbolizerPlugIn(object):
    """
    Base class of all plugins: the hooks below are how the
    `asan_symbolize.py` code talks to a plugin.
    """

    @classmethod
    def get_name(cls):
        """
        Name of the plugin, i.e. the name of its class.
        """
        return cls.__name__

    def _receive_proxy(self, proxy):
        assert isinstance(proxy, AsanSymbolizerPlugInProxy)
        self.proxy = proxy

    def register_cmdline_args(self, parser):
        """
        Hook in which a plugin registers the command line arguments
        it later consumes in `process_cmdline_args()`.

        `parser` - Instance of `argparse.ArgumentParser`.
        """

    def process_cmdline_args(self, pargs):
        """
        Hook in which a plugin handles the parsed arguments.
        Implementations should not modify `pargs`.

        `pargs` - Instance of `argparse.Namespace` containing
        parsed command line arguments.

        Return `True` to keep using the plug-in, `False` to have
        it removed.
        """
        return True

    def destroy(self):
        """
        Hook called when the plugin is about to be destroyed.
        Implementations should release any resources they hold.
        """

    # Symbolization hooks
    def filter_binary_path(self, binary_path):
        """
        Map a binary path to the binary path to use for symbolication.

        Implementations should return `None` if symbolication of this binary
        should be skipped.
        """
        return binary_path

    def filter_module_desc(self, module_desc):
        """
        Map a ModuleDesc object (`module_desc`) to the ModuleDesc to use
        for symbolication.

        Implementations should return `None` if symbolication of this binary
        should be skipped.
        """
        return module_desc
760
761
class ModuleDesc(object):
    """Description of one loaded module: name, arch, address range, path, UUID."""

    def __init__(self, name, arch, start_addr, end_addr, module_path, uuid):
        self.name = name
        self.arch = arch
        self.start_addr = start_addr
        self.end_addr = end_addr
        # Path of the module as it appears in the ASan report.
        self.module_path = module_path
        # Path actually used for symbolization; starts out equal to the
        # report path.
        self.module_path_for_symbolization = module_path
        self.uuid = uuid
        assert self.is_valid()

    def __str__(self):
        assert self.is_valid()
        if self.module_path == self.module_path_for_symbolization:
            shown_path = self.module_path
        else:
            # Show the symbolization path first, the report path in parentheses.
            shown_path = "{} ({})".format(
                self.module_path_for_symbolization, self.module_path
            )
        template = "{name} {arch} {start_addr:#016x}-{end_addr:#016x} {module_path} {uuid}"
        return template.format(
            name=self.name,
            arch=self.arch,
            start_addr=self.start_addr,
            end_addr=self.end_addr,
            module_path=shown_path,
            uuid=self.uuid,
        )

    def is_valid(self):
        """Check field types, the address range and that both paths are absolute."""
        return (
            isinstance(self.name, str)
            and isinstance(self.arch, str)
            and isinstance(self.start_addr, int)
            and self.start_addr >= 0
            and isinstance(self.end_addr, int)
            and self.end_addr > self.start_addr
            and isinstance(self.module_path, str)
            and os.path.isabs(self.module_path)
            and isinstance(self.module_path_for_symbolization, str)
            and os.path.isabs(self.module_path_for_symbolization)
            and isinstance(self.uuid, str)
        )
812
813
class GetUUIDFromBinaryException(Exception):
    """Raised when the UUID of a binary cannot be determined."""

    def __init__(self, msg):
        super(GetUUIDFromBinaryException, self).__init__(msg)


# Maps (path_to_binary, arch) -> UUID string for lookups that already
# succeeded, so `otool` is not re-run for the same binary/arch pair.
_get_uuid_from_binary_cache = dict()


def _parse_uuid_from_otool_output(output_str):
    """
    Extract the UUID from the text printed by `otool -l`.

    Returns the UUID string of the first `LC_UUID` load command, or `None`
    if the output contains no such command.

    Raises `GetUUIDFromBinaryException` if an `LC_UUID` command is present
    but the line expected to carry the UUID is missing or malformed.
    """
    # Look for this output:
    # cmd LC_UUID
    # cmdsize 24
    # uuid 4CA778FE-5BF9-3C45-AE59-7DF01B2BE83F
    lines = output_str.split("\n")
    for index, line in enumerate(lines):
        if not line.strip().startswith("cmd LC_UUID"):
            continue
        # The UUID is expected two lines below the command line. The output
        # may be truncated, so never index past the end of `lines`.
        uuid_line = lines[index + 2].strip() if index + 2 < len(lines) else ""
        split_uuid_line = uuid_line.split()
        if not uuid_line.startswith("uuid") or len(split_uuid_line) < 2:
            raise GetUUIDFromBinaryException('Malformed output: "{}"'.format(uuid_line))
        return split_uuid_line[1]
    return None


def get_uuid_from_binary(path_to_binary, arch=None):
    """
    Return the UUID of the binary at `path_to_binary` as reported by
    `/usr/bin/otool -l`.

    `arch`, when given, is passed to otool via `-arch`. Successful lookups
    are cached per `(path_to_binary, arch)`.

    Raises `GetUUIDFromBinaryException` if the binary does not exist, if
    otool cannot be run or exits with an error, or if no well-formed UUID
    can be found in its output.
    """
    cache_key = (path_to_binary, arch)
    cached_value = _get_uuid_from_binary_cache.get(cache_key)
    if cached_value:
        return cached_value
    if not os.path.exists(path_to_binary):
        raise GetUUIDFromBinaryException(
            'Binary "{}" does not exist'.format(path_to_binary)
        )
    cmd = ["/usr/bin/otool", "-l"]
    if arch:
        cmd.extend(["-arch", arch])
    cmd.append(path_to_binary)
    try:
        output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except (OSError, subprocess.CalledProcessError) as e:
        # Report tool failures through the one exception type this function
        # documents, so callers that handle lookup failures also handle a
        # missing or failing otool instead of crashing.
        raise GetUUIDFromBinaryException(
            'Failed to run "{}": {}'.format(" ".join(cmd), e)
        )
    # check_output may hand back either text or bytes; normalise to text.
    if isinstance(output, str):
        output_str = output
    else:
        assert isinstance(output, bytes)
        output_str = output.decode()
    assert isinstance(output_str, str)
    uuid = _parse_uuid_from_otool_output(output_str)
    if uuid is None:
        logging.error("Failed to retrieve UUID from binary {}".format(path_to_binary))
        logging.error("otool output was:\n{}".format(output_str))
        raise GetUUIDFromBinaryException(
            'Failed to retrieve UUID from binary "{}"'.format(path_to_binary)
        )
    # Update cache
    _get_uuid_from_binary_cache[cache_key] = uuid
    return uuid
868
869
class ModuleMap(object):
    """
    Collection of `ModuleDesc` objects indexed by module name.

    Instances are normally built by `parse_from_file` from the text of a
    process module map.
    """

    def __init__(self):
        self._module_name_to_description_map = dict()

    def add_module(self, desc):
        """Register `desc` (a `ModuleDesc`); its name must not be present yet."""
        assert isinstance(desc, ModuleDesc)
        assert desc.name not in self._module_name_to_description_map
        self._module_name_to_description_map[desc.name] = desc

    def find_module_by_name(self, name):
        """Return the description registered under `name`, or `None`."""
        return self._module_name_to_description_map.get(name, None)

    def __str__(self):
        """Render a header line followed by one line per module, ordered by start address."""
        ordered = sorted(
            self._module_name_to_description_map.values(), key=lambda v: v.start_addr
        )
        header = "{} modules:\n".format(self.num_modules)
        return header + "".join(str(module_desc) + "\n" for module_desc in ordered)

    @property
    def num_modules(self):
        """Number of modules in the map."""
        return len(self._module_name_to_description_map)

    @property
    def modules(self):
        """Set of all module descriptions in the map."""
        return set(self._module_name_to_description_map.values())

    def get_module_path_for_symbolication(self, module_name, proxy, validate_uuid):
        """
        Return the path of the binary to symbolicate for `module_name`.

        `proxy.filter_module_desc` is given the chance to replace or veto the
        module description. When `validate_uuid` is true, the UUID read from
        the binary must equal the UUID recorded in the description.

        Returns `None` when the module is unknown, when the proxy returns
        `None`, when the UUIDs differ, or when the UUID cannot be read.
        """
        module_desc = self.find_module_by_name(module_name)
        if module_desc is None:
            return None
        # Allow a plug-in to change the module description to make it
        # suitable for symbolication or avoid symbolication altogether.
        module_desc = proxy.filter_module_desc(module_desc)
        if module_desc is None:
            return None
        if validate_uuid:
            logging.debug(
                "Validating UUID of {}".format(
                    module_desc.module_path_for_symbolization
                )
            )
            try:
                uuid = get_uuid_from_binary(
                    module_desc.module_path_for_symbolization, arch=module_desc.arch
                )
                if uuid != module_desc.uuid:
                    logging.warning(
                        "Detected UUID mismatch {} != {}".format(uuid, module_desc.uuid)
                    )
                    # UUIDs don't match. Tell client to not symbolize this.
                    return None
            except GetUUIDFromBinaryException as e:
                # The failed operation is reading the UUID out of the binary.
                logging.error("Failed to get UUID from binary: %s", str(e))
                return None
        else:
            logging.warning(
                "Skipping validation of UUID of {}".format(
                    module_desc.module_path_for_symbolization
                )
            )
        return module_desc.module_path_for_symbolization

    @staticmethod
    def parse_from_file(module_map_path):
        """
        Build a `ModuleMap` from the text file at `module_map_path`.

        Lines before a line starting with "Process module map:" are ignored.
        After it, every line up to one starting with "End of module map"
        must describe a module. The module name is the base name of its path.

        Returns the populated map, or `None` if the file contains no
        "Process module map:" line.

        Raises `Exception` if the file does not exist or if a line inside
        the module map cannot be parsed (this includes reaching the end of
        the file before "End of module map").
        """
        if not os.path.exists(module_map_path):
            raise Exception('module map "{}" does not exist'.format(module_map_path))
        with open(module_map_path, "r") as f:
            mm = None
            # E.g.
            # 0x2db4000-0x102ddc000 /path/to (arm64) <0D6BBDE0-FF90-3680-899D-8E6F9528E04C>
            hex_regex = lambda name: r"0x(?P<" + name + r">[0-9a-f]+)"
            module_path_regex = r"(?P<path>.+)"
            arch_regex = r"\((?P<arch>.+)\)"
            uuid_regex = r"<(?P<uuid>[0-9A-Z-]+)>"
            line_regex = r"^{}-{}\s+{}\s+{}\s+{}".format(
                hex_regex("start_addr"),
                hex_regex("end_addr"),
                module_path_regex,
                arch_regex,
                uuid_regex,
            )
            matcher = re.compile(line_regex)
            line_num = 0
            line = "dummy"
            # readline() returns "" only at end of file.
            while line != "":
                line = f.readline()
                line_num += 1
                if mm is None:
                    # Still searching for the start of the module map.
                    if line.startswith("Process module map:"):
                        mm = ModuleMap()
                    continue
                if line.startswith("End of module map"):
                    break
                m_obj = matcher.match(line)
                if not m_obj:
                    raise Exception(
                        'Failed to parse line {} "{}"'.format(line_num, line)
                    )
                arch = m_obj.group("arch")
                start_addr = int(m_obj.group("start_addr"), base=16)
                end_addr = int(m_obj.group("end_addr"), base=16)
                module_path = m_obj.group("path")
                uuid = m_obj.group("uuid")
                module_desc = ModuleDesc(
                    name=os.path.basename(module_path),
                    arch=arch,
                    start_addr=start_addr,
                    end_addr=end_addr,
                    module_path=module_path,
                    uuid=uuid,
                )
                mm.add_module(module_desc)
            if mm is not None:
                logging.debug(
                    'Loaded Module map from "{}":\n{}'.format(f.name, str(mm))
                )
            return mm
989
990
class SysRootFilterPlugIn(AsanSymbolizerPlugIn):
    """
    Built-in plug-in that prefixes every binary path used for
    symbolication with a sys root given on the command line (`-s`).
    """

    def __init__(self):
        # Prefix put in front of binary paths; empty until `-s` is processed.
        self.sysroot_path = ""

    def register_cmdline_args(self, parser):
        """Add the `-s SYSROOT` option to `parser`."""
        parser.add_argument(
            "-s",
            metavar="SYSROOT",
            dest="sys_root",
            help="set path to sysroot for sanitized binaries",
        )

    def process_cmdline_args(self, pargs):
        """
        Record the sys root from the parsed arguments.

        Returns True when `-s` was supplied, False otherwise.
        """
        if pargs.sys_root is not None:
            self.sysroot_path = pargs.sys_root
            return True
        # Not being used so remove ourselves.
        return False

    def filter_binary_path(self, path):
        """Return `path` with the configured sys root prepended."""
        return self.sysroot_path + path
1017
1018
class ModuleMapPlugIn(AsanSymbolizerPlugIn):
    """
    Built-in plug-in that resolves binary paths through a module map
    loaded from the file given with `--module-map`.
    """

    def __init__(self):
        # ModuleMap loaded in process_cmdline_args(); None until then.
        self._module_map = None
        # Whether module UUIDs are checked before a path is returned.
        self._uuid_validation = True

    def register_cmdline_args(self, parser):
        """Add the `--module-map` and `--skip-uuid-validation` options to `parser`."""
        parser.add_argument(
            "--module-map",
            help="Path to text file containing module map "
            "output. See print_module_map ASan option.",
        )
        parser.add_argument(
            "--skip-uuid-validation",
            default=False,
            action="store_true",
            help="Skips validating UUID of modules using otool.",
        )

    def process_cmdline_args(self, pargs):
        """
        Load the module map named by the parsed arguments `pargs`.

        Returns False when `--module-map` was not given, True once the map
        has been loaded. Raises `Exception` if the file contains no module
        map (parsing errors raised by `ModuleMap.parse_from_file` propagate).
        """
        if not pargs.module_map:
            return False
        # Use the namespace passed in rather than a module-level global, so
        # this method does not depend on how the script's entry point names
        # its variables.
        self._module_map = ModuleMap.parse_from_file(pargs.module_map)
        if self._module_map is None:
            msg = "Failed to find module map"
            logging.error(msg)
            raise Exception(msg)
        self._uuid_validation = not pargs.skip_uuid_validation
        return True

    def filter_binary_path(self, binary_path):
        """
        Look `binary_path` up in the module map and return the path to use
        for symbolication, or `None` if the module map lookup yields none.

        An absolute path is reduced to its base name before the lookup;
        anything else is used as the module name as-is.
        """
        if os.path.isabs(binary_path):
            # This is a binary path so transform into
            # a module name
            module_name = os.path.basename(binary_path)
        else:
            module_name = binary_path
        # NOTE(review): `self.proxy` is not assigned in this class; presumably
        # it is set from outside when the plug-in is registered - confirm.
        return self._module_map.get_module_path_for_symbolication(
            module_name, self.proxy, self._uuid_validation
        )
1058
1059
def add_logging_args(parser):
    """Register the `--log-dest` and `--log-level` options on `parser`."""
    level_names = ["debug", "info", "warning", "error", "critical"]
    parser.add_argument(
        "--log-dest",
        help="Destination path for script logging (default stderr).",
        default=None,
    )
    parser.add_argument(
        "--log-level",
        help="Log level for script (default: %(default)s).",
        default="info",
        choices=level_names,
    )
1072
1073
def setup_logging():
    """
    Configure the `logging` module from the logging options on the
    process command line and return the arguments that were not consumed.
    """
    # Logging should be configured before the main argument parsing is
    # performed, so a dedicated parser handles just the logging arguments.
    logging_parser = argparse.ArgumentParser(add_help=False)
    add_logging_args(logging_parser)
    pargs, unparsed_args = logging_parser.parse_known_args()

    level_name = pargs.log_level
    log_level = getattr(logging, level_name.upper())
    # Debug runs get a richer format that shows where each message originates.
    log_format = (
        "%(levelname)s: [%(funcName)s() %(filename)s:%(lineno)d] %(message)s"
        if log_level == logging.DEBUG
        else "%(levelname)s: %(message)s"
    )
    basic_config = dict(level=log_level, format=log_format)
    destination = pargs.log_dest
    if destination:
        basic_config["filename"] = destination
    logging.basicConfig(**basic_config)

    shown_destination = "stderr" if destination is None else destination
    logging.debug(
        'Logging level set to "{}" and directing output to "{}"'.format(
            level_name, shown_destination
        )
    )
    return unparsed_args
1100
1101
def add_load_plugin_args(parser):
    """Register the `-p` / `--plugins` option (one or more values) on `parser`."""
    parser.add_argument(
        "-p",
        "--plugins",
        nargs="+",
        default=[],
        help="Load plug-in",
    )
1104
1105
def setup_plugins(plugin_proxy, args):
    """
    Load the plug-ins named on the process command line into
    `plugin_proxy`, then add the built-in plug-ins.

    Returns the command line arguments not consumed by the plug-in
    options. Note that `args` is accepted but not read by this function;
    the parser is run without an explicit argument list.
    """
    plugin_arg_parser = argparse.ArgumentParser(add_help=False)
    add_load_plugin_args(plugin_arg_parser)
    parsed, leftover_args = plugin_arg_parser.parse_known_args()
    for path in parsed.plugins:
        plugin_proxy.load_plugin_from_file(path)
    # Add built-in plugins.
    for builtin_cls in (ModuleMapPlugIn, SysRootFilterPlugIn):
        plugin_proxy.add_plugin(builtin_cls())
    return leftover_args
1116
1117
# Script entry point: configure logging, load plug-ins, parse the command
# line, copy the options into the module-level settings and symbolize the log.
if __name__ == "__main__":
    # Logging is set up first; the call returns the arguments it left unparsed.
    remaining_args = setup_logging()
    with AsanSymbolizerPlugInProxy() as plugin_proxy:
        # Load external and built-in plug-ins before building the main parser
        # so that they can register their own options below.
        remaining_args = setup_plugins(plugin_proxy, remaining_args)
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description="ASan symbolization script",
            epilog=__doc__,
        )
        parser.add_argument(
            "path_to_cut",
            nargs="*",
            help="pattern to be cut from the result file path ",
        )
        parser.add_argument(
            "-d", "--demangle", action="store_true", help="demangle function names"
        )
        parser.add_argument(
            "-c", metavar="CROSS_COMPILE", help="set prefix for binutils"
        )
        parser.add_argument(
            "-l",
            "--logfile",
            default=sys.stdin,
            type=argparse.FileType("r"),
            help="set log file name to parse, default is stdin",
        )
        parser.add_argument(
            "--force-system-symbolizer",
            action="store_true",
            help="don't use llvm-symbolizer",
        )
        # Add logging arguments so that `--help` shows them.
        add_logging_args(parser)
        # Add load plugin arguments so that `--help` shows them.
        add_load_plugin_args(parser)
        # Let every loaded plug-in contribute its own options, then hand the
        # parsed result back to the plug-ins.
        plugin_proxy.register_cmdline_args(parser)
        args = parser.parse_args(remaining_args)
        plugin_proxy.process_cmdline_args(args)
        # Copy the parsed options into the module-level settings declared at
        # the top of the file; options that were not given leave them as-is.
        if args.path_to_cut:
            fix_filename_patterns = args.path_to_cut
        if args.demangle:
            demangle = True
        if args.c:
            binutils_prefix = args.c
        if args.logfile:
            logfile = args.logfile
        else:
            logfile = sys.stdin
        if args.force_system_symbolizer:
            force_system_symbolizer = True
        # Forcing the system symbolizer requires that it is allowed at all.
        if force_system_symbolizer:
            assert allow_system_symbolizer
        loop = SymbolizationLoop(plugin_proxy)
        loop.process_logfile()
1173