xref: /llvm-project/compiler-rt/lib/hwasan/scripts/hwasan_symbolize (revision 61353cc1f65f02477eedeebcb08e9193cbd53305)
1#!/usr/bin/env python3
2#===- lib/hwasan/scripts/hwasan_symbolize ----------------------------------===#
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8#===------------------------------------------------------------------------===#
9#
10# HWAddressSanitizer offline symbolization script.
11#
12#===------------------------------------------------------------------------===#
13
14from __future__ import print_function
15from __future__ import unicode_literals
16
17import argparse
18import glob
19import html
20import json
21import mmap
22import os
23import re
24import struct
25import subprocess
26import sys
27
# Python 2 compatibility shim: wrap stdout so that text is encoded as UTF-8.
# NOTE(review): this module also imports `html` and catches FileNotFoundError,
# neither of which exists on Python 2, so this branch is presumably dead code
# kept for historical reasons - confirm before relying on it.
if sys.version_info.major < 3:
  # Simulate Python 3.x behaviour of defaulting to UTF-8 for print. This is
  # important in case any symbols are non-ASCII.
  import codecs
  sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
33
# Below, a parser for a subset of ELF. It only supports 64 bit, little-endian,
# and only parses what is necessary to find the build ids. It uses a memoryview
# into an mmap to avoid copying.
# ELF file header: its size in bytes and the offsets of the fields that
# handle_elf() reads from it.
Ehdr_size = 64
e_shnum_offset = 60  # 16-bit field: number of section headers.
e_shoff_offset = 40  # 64-bit field: file offset of the section header table.

# Section header: its size in bytes and the offsets of the fields that
# handle_Shdr() reads from it.
Shdr_size = 64
sh_type_offset = 4  # 32-bit field: section type.
sh_offset_offset = 24  # 64-bit field: file offset of the section contents.
sh_size_offset = 32  # 64-bit field: size of the section contents in bytes.
SHT_NOTE = 7  # Section type of note sections, the only ones inspected.

# Note header: three 32-bit words (name size, descriptor size, type).
Nhdr_size = 12
NT_GNU_BUILD_ID = 3  # Note type that carries the GNU build id.
49
def align_up(size, alignment):
  """Round size up to the next multiple of alignment.

  The bit-mask arithmetic is only correct when alignment is a power of two.
  """
  mask = alignment - 1
  return (size + mask) & ~mask
52
def handle_Nhdr(mv, sh_size):
  """Scan the contents of an ELF note section for a GNU build id.

  Args:
    mv: sliceable buffer (a memoryview in this file) that starts at the first
      byte of the note section.
    sh_size: size of the section in bytes, as declared by its section header.

  Returns:
    The build id as a hex string, or None if the section does not contain a
    GNU build-id note.
  """
  # The declared size may exceed the data that is really there (truncated
  # file), and a section may end with fewer bytes than a note header. Bound
  # the scan so that such input yields None instead of a struct.error that
  # would abort the search of the remaining sections.
  limit = min(sh_size, len(mv))
  offset = 0
  while offset + Nhdr_size <= limit:
    n_namesz, n_descsz, n_type = struct.unpack_from('<III', mv, offset)
    name_start = offset + Nhdr_size
    if (n_type == NT_GNU_BUILD_ID and n_namesz == 4 and
        mv[name_start: name_start + 4] == b"GNU\x00"):
      # The descriptor (the build id itself) directly follows the 4-byte name.
      desc_start = name_start + 4
      return mv[desc_start: desc_start + n_descsz].hex()
    # Name and descriptor are each padded to a 4-byte boundary.
    offset = name_start + align_up(n_namesz, 4) + align_up(n_descsz, 4)
  return None
64
def handle_Shdr(mv):
  """Extract the location of a note section from one section header.

  Args:
    mv: buffer holding a single section header.

  Returns:
    (sh_offset, sh_size) when the header describes a SHT_NOTE section,
    otherwise (None, None).
  """
  (section_type,) = struct.unpack_from('<I', mv, sh_type_offset)
  if section_type == SHT_NOTE:
    (contents_offset,) = struct.unpack_from('<Q', mv, sh_offset_offset)
    (contents_size,) = struct.unpack_from('<Q', mv, sh_size_offset)
    return contents_offset, contents_size
  return None, None
72
def handle_elf(mv):
  """Return the GNU build id of an ELF image, or None if there is none.

  Args:
    mv: buffer over the whole file.

  Returns:
    The build id as a hex string; None when the file is not a 64-bit
    little-endian ELF or no note section carries a build id.
  """
  # \x02 is ELFCLASS64, \x01 is ELFDATA2LSB. HWASan currently only works on
  # 64-bit little endian platforms (x86_64 and ARM64). If this changes, we will
  # have to extend the parsing code.
  if mv[:6] != b'\x7fELF\x02\x01':
    return None
  (section_count,) = struct.unpack_from('<H', mv, e_shnum_offset)
  (table_offset,) = struct.unpack_from('<Q', mv, e_shoff_offset)
  for index in range(section_count):
    header_start = table_offset + index * Shdr_size
    note_offset, note_size = handle_Shdr(
        mv[header_start: header_start + Shdr_size])
    if note_offset is not None:
      # Only note sections reach this point; look for the build id inside.
      build_id = handle_Nhdr(mv[note_offset: note_offset + note_size],
                             note_size)
      if build_id is not None:
        return build_id
  return None
90
def get_buildid(filename):
  """Return the GNU build id of the file at filename, or None.

  The file is memory-mapped read-only and parsed by handle_elf(). Files that
  are smaller than an ELF header are rejected up front, which also avoids
  mapping an empty file.

  Args:
    filename: path of the file to inspect.

  Returns:
    The build id as a hex string, or None if the file is too small or
    handle_elf() finds no build id.
  """
  # ELF images are binary data: open in binary mode so that no text decoding
  # layer is involved. Only the file descriptor is used below.
  with open(filename, "rb") as fd:
    if os.fstat(fd.fileno()).st_size < Ehdr_size:
      return None
    with mmap.mmap(fd.fileno(), 0, access=mmap.ACCESS_READ) as m:
      with memoryview(m) as mv:
        return handle_elf(mv)
98
class Symbolizer:
  """Symbolizes HWASan report lines by driving an llvm-symbolizer subprocess.

  The object keeps a single long-lived llvm-symbolizer process (started on
  first use) and some per-report state: the address and pointer tag of the
  faulting access, the memory tag dump, and stack variables that were near,
  but not at, the faulting address.
  """

  def __init__(self, path, binary_prefixes, paths_to_cut):
    """Create a symbolizer.

    Args:
      path: path of the llvm-symbolizer executable.
      binary_prefixes: directories that are searched for binaries with symbols.
      paths_to_cut: source path prefixes that are stripped from file names.
    """
    self.__pipe = None
    self.__path = path
    self.__binary_prefixes = binary_prefixes
    self.__paths_to_cut = paths_to_cut
    self.__log = False
    # Binary names that were already reported as missing.
    self.__warnings = set()
    # Build id (hex string) -> file name, filled in by build_index().
    self.__index = {}
    # (prefix, link template) pairs, filled in by read_linkify().
    self.__link_prefixes = []
    self.__html = False
    self.__last_access_address = None
    self.__last_access_tag = None
    # Flat list of the memory tags seen in the tag dump so far.
    self.__tag_dump = []
    # Index into __tag_dump of the tag that was printed in [brackets].
    self.__tag_dump_match_idx = None
    self.__matched_stack_uas = False
    # (offset, size, local) of stack variables that missed the access address.
    self.__offsets = []

  def enable_html(self, enable):
    """Switch HTML output (escaping, <br/>, links) on or off."""
    self.__html = enable

  def enable_logging(self, enable):
    """Switch logging of the llvm-symbolizer dialogue to stderr on or off."""
    self.__log = enable

  def maybe_escape(self, text):
    """Return text escaped for HTML when HTML output is on, else unchanged."""
    if not self.__html:
      return text
    # We need to manually use &nbsp; for leading spaces, html.escape does
    # not do that, and HTML ignores them. lstrip() also handles a string that
    # consists only of spaces, for which every space must be converted.
    stripped = text.lstrip(' ')
    return (len(text) - len(stripped)) * '&nbsp;' + html.escape(stripped)

  def print(self, line, escape=True):
    """Print one output line, appending <br/> in HTML mode.

    Args:
      line: text to print.
      escape: pass False when line is already HTML-safe (for example because
        it contains markup produced by maybe_linkify()).
    """
    if escape:
      line = self.maybe_escape(line)
    if self.__html:
      line += '<br/>'
    print(line)

  def read_linkify(self, filename):
    """Load link rules from a JSON file.

    The file must contain a list of objects with the keys "prefix" and "link".
    """
    with open(filename, 'r') as fd:
      data = json.load(fd)
    self.__link_prefixes = [(e["prefix"], e["link"]) for e in data]

  def __open_pipe(self):
    """Start the llvm-symbolizer subprocess unless it is already running."""
    if not self.__pipe:
      opt = {}
      if sys.version_info.major > 2:
        opt['encoding'] = 'utf-8'
      self.__pipe = subprocess.Popen([self.__path, "--inlining", "--functions"],
                                     stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                     **opt)

  class __EOF(Exception):
    """Raised by __read() when llvm-symbolizer returns an empty line."""

  def __write(self, s):
    """Send one request line to llvm-symbolizer and flush it."""
    print(s, file=self.__pipe.stdin)
    self.__pipe.stdin.flush()
    if self.__log:
      print("#>>  |%s|" % (s,), file=sys.stderr)

  def __read(self):
    """Read one response line from llvm-symbolizer.

    Raises:
      Symbolizer.__EOF: the line is empty after stripping trailing whitespace.
        The iter_* methods treat this as the end of a response.
    """
    s = self.__pipe.stdout.readline().rstrip()
    if self.__log:
      print("# << |%s|" % (s,), file=sys.stderr)
    if s == '':
      raise Symbolizer.__EOF
    return s

  def __process_source_path(self, file_name):
    """Shorten a "file:line[:col]" string for display.

    Strips the configured source prefixes and replaces sanitizer runtime and
    crtstuff locations with fixed placeholders.
    """
    for path_to_cut in self.__paths_to_cut:
      # The prefixes are plain paths, not patterns: escape them so that
      # characters such as '+', '(' or '.' in a directory name neither break
      # the match nor raise re.error.
      file_name = re.sub(".*" + re.escape(path_to_cut), "", file_name)
    # Runtime sources may use the .cc or the .cpp suffix.
    file_name = re.sub(r".*hwasan_[a-z_]*\.(cc|cpp|h):[0-9]*", "[hwasan_rtl]", file_name)
    file_name = re.sub(r".*asan_[a-z_]*\.(cc|cpp|h):[0-9]*", "[asan_rtl]", file_name)
    file_name = re.sub(r".*crtstuff\.c:0", "???:0", file_name)
    return file_name

  def __process_binary_name(self, name, buildid):
    """Map a binary name from the report to a local file with symbols.

    Args:
      name: path of the binary as printed in the report.
      buildid: build id from the report, or None.

    Returns:
      The path of the local file, or None if nothing was found (a warning is
      printed to stderr once per name).
    """
    if name.startswith('/'):
      name = name[1:]
    # A build id match from the index wins over any path based lookup.
    if buildid is not None and buildid in self.__index:
      return self.__index[buildid]

    apex_prefix = "apex/com.android."
    for p in self.__binary_prefixes:
      full_path = os.path.join(p, name)
      if os.path.exists(full_path):
        return full_path
      # Also try the same path under apex/com.google.android.
      if name.startswith(apex_prefix):
        full_path = os.path.join(p, "apex/com.google.android." + name[len(apex_prefix):])
        if os.path.exists(full_path):
          return full_path
    # Try stripping extra path components as the last resort.
    for p in self.__binary_prefixes:
      full_path = os.path.join(p, os.path.basename(name))
      if os.path.exists(full_path):
        return full_path
    if name not in self.__warnings:
      print("Could not find symbols for", name, file=sys.stderr)
      self.__warnings.add(name)
    return None

  def iter_locals(self, binary, addr, buildid):
    """Yield the stack variables of the function that contains addr.

    Sends a FRAME request to llvm-symbolizer and yields one tuple
    (function_name, file_line, local_name, offset, size, tag_offset) per
    variable. offset, size and tag_offset are ints, or None where
    llvm-symbolizer printed '??'. Yields nothing if the binary is not found.
    """
    def parse_field(value):
      return None if value == '??' else int(value)

    self.__open_pipe()
    binary = self.__process_binary_name(binary, buildid)
    if not binary:
      return
    self.__write("FRAME %s %s" % (binary, addr))
    try:
      while True:
        function_name = self.__read()
        local_name = self.__read()
        file_line = self.__read()
        extra = self.__read().split()

        file_line = self.__process_source_path(file_line)
        offset = parse_field(extra[0])
        size = parse_field(extra[1])
        tag_offset = parse_field(extra[2])
        yield (function_name, file_line, local_name, offset, size, tag_offset)
    except Symbolizer.__EOF:
      pass

  def iter_call_stack(self, binary, buildid, addr):
    """Yield (function_name, file_line) for addr, one tuple per inlined frame.

    Sends a CODE request to llvm-symbolizer. Yields nothing if the binary is
    not found.
    """
    self.__open_pipe()
    binary = self.__process_binary_name(binary, buildid)
    if not binary:
      return
    self.__write("CODE %s %s" % (binary, addr))
    try:
      while True:
        function_name = self.__read()
        file_line = self.__read()
        file_line = self.__process_source_path(file_line)
        yield (function_name, file_line)
    except Symbolizer.__EOF:
      pass

  def maybe_linkify(self, file_line):
    """Return file_line ready for output, as a link where a rule applies.

    Without HTML output the text is returned unchanged. With HTML output the
    text is HTML-escaped and, if the file name starts with one of the
    configured prefixes, wrapped in an <a> element built from the link
    template of the longest matching prefix.
    """
    if not self.__html:
      return file_line
    # The result is printed with escape=False, so escape the text here.
    text = html.escape(file_line)
    if not self.__link_prefixes:
      return text
    # partition() tolerates a location without any ':' separator.
    filename, _, line_col = file_line.partition(':')
    if not line_col:
      line = '0' # simplify the link generation
    else:
      line = line_col.split(':')[0]
    longest_prefix = max((
      (prefix, link) for prefix, link in self.__link_prefixes
      if filename.startswith(prefix)),
      key=lambda x: len(x[0]), default=None)
    if longest_prefix is None:
      return text
    prefix, link = longest_prefix
    return '<a href="{}">{}</a>'.format(
      html.escape(link.format(file=filename[len(prefix):], line=line,
                              file_line=file_line, prefix=prefix)), text)

  def build_index(self):
    """Index every file below the binary prefixes by its GNU build id.

    Files that vanish while walking are skipped silently; other failures are
    reported on stderr and skipped.
    """
    for p in self.__binary_prefixes:
      for dname, _, fnames in os.walk(p):
        for fn in fnames:
          filename = os.path.join(dname, fn)
          try:
            bid = get_buildid(filename)
          except FileNotFoundError:
            continue
          except Exception as e:
            print("Failed to parse {}: {}".format(filename, e), file=sys.stderr)
            continue
          if bid is not None:
            self.__index[bid] = filename

  def symbolize_line(self, line):
    """Print line, symbolized if it is a stack frame line.

    A frame line is replaced by one line per (inlined) frame returned by
    llvm-symbolizer; any other line, and any frame that cannot be symbolized,
    is printed as is with trailing whitespace removed.
    """
    #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45) (BuildId: 4abce4cd41ea5c2f34753297b7e774d9)
    match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]*) *\((.*)\+(0x[0-9a-f]+)\)'
                    r'(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
    frames = []
    if match:
      binary = match.group(5)
      addr = int(match.group(6), 16)
      buildid = match.group(7)
      frames = list(self.iter_call_stack(binary, buildid, addr))

    if not frames:
      self.print(line.rstrip())
      return

    first_function, first_file_line = frames[0]
    self.print(
      self.maybe_escape(
        "%s#%s%s%s in " % (match.group(1), match.group(2), match.group(3),
                          first_function)
      ) + self.maybe_linkify(first_file_line),
      escape=False)
    # Inlined frames are indented so that "->" sits under the frame number.
    space1 = ' ' * match.end(1)
    space2 = ' ' * (match.start(4) - match.end(1) - 2)
    for function_name, file_line in frames[1:]:
      self.print(
        self.maybe_escape("%s->%s%s in " % (space1, space2, function_name))
          + self.maybe_linkify(file_line), escape=False)

  def save_access_address(self, line):
    """Remember the faulting address and pointer tag if line mentions them."""
    match = re.match(r'^(.*?)HWAddressSanitizer: tag-mismatch on address (0x[0-9a-f]+) ', line, re.UNICODE)
    if match:
      self.__last_access_address = int(match.group(2), 16)
    match = re.match(r'^(.*?) of size [0-9]+ at 0x[0-9a-f]* tags: ([0-9a-f]+)/[0-9a-f]+(\([0-9a-f]+\))? \(ptr/mem\)', line, re.UNICODE)
    if match:
      self.__last_access_tag = int(match.group(2), 16)

  def process_tag_dump_line(self, line, ignore_tags=False):
    """Record one row of the memory tag dump.

    Args:
      line: candidate row: an address followed by 16 two-digit hex tags, one
        of which may be enclosed in [brackets].
      ignore_tags: unused; kept for interface compatibility.

    Returns:
      True if line was a tag dump row and has been recorded, else False.
    """
    m = re.match(r'.*?(0x[0-9a-f]+):' + r'([ ]*[\[ ][0-9a-f][0-9a-f]\]?)' * 16, line)
    if m is None:
      return False
    tags = m.group(*range(2, 18))
    fault = [i for i, x in enumerate(tags) if '[' in x]
    if fault:
      # Position of the bracketed tag within the flattened dump.
      self.__tag_dump_match_idx = len(self.__tag_dump) + fault[0]
    self.__tag_dump.extend(int(x.strip(' [').rstrip('] '), 16) for x in tags)
    return True

  def finish_tag_dump(self):
    """Report nearby stack variables once the tag dump is complete.

    Does nothing if process_stack_history() already reported a variable that
    contains the access address, or if no bracketed tag was seen. Otherwise
    each near-miss variable whose granule in the tag dump carries the tag of
    the faulting pointer is printed, closest first.
    """
    if self.__matched_stack_uas or self.__tag_dump_match_idx is None:
      return
    for offset, size, local in sorted(self.__offsets, key=lambda x: abs(x[0])):
      # One dump entry covers 16 bytes.
      idx = self.__tag_dump_match_idx - offset // 16
      # idx == len(...) is already outside the dump and must be skipped too.
      if idx < 0 or idx >= len(self.__tag_dump):
        continue
      if self.__tag_dump[idx] == self.__last_access_tag:
        self.print('')
        self.print('Potentially referenced stack object:')
        if offset > 0:
          self.print('  %d bytes after a variable "%s" in stack frame of function "%s"' % (offset - size, local[2], local[0]))
        if offset < 0:
          self.print('  %d bytes before a variable "%s" in stack frame of function "%s"' % (-offset, local[2], local[0]))
        self.print('  at %s' % (local[1],))

  def process_stack_history(self, line, ignore_tags=False):
    """Handle a line of the "Previously allocated frames" section.

    For a record line, the stack variables of the recorded function are
    compared with the faulting access: a variable that contains the address
    is reported immediately, one that misses it is remembered for
    finish_tag_dump().

    Args:
      line: line of the report.
      ignore_tags: if True, variables are considered even when their tag does
        not match the tag of the faulting pointer.

    Returns:
      True if the line was consumed (the section header or a record line),
      False otherwise, including when no faulting address and tag have been
      seen yet.
    """
    if self.__last_access_address is None or self.__last_access_tag is None:
      return False
    if re.match(r'Previously allocated frames:', line, re.UNICODE):
      return True
    fp_mask = (1 << 20) - 1
    # record_addr:0x1234ABCD record:0x1234ABCD (/path/to/binary+0x1234ABCD) (BuildId: 4abce4cd41ea5c2f34753297b7e774d9)
    match = re.match(r'^(.*?)record_addr:(0x[0-9a-f]+) +record:(0x[0-9a-f]+) +\((.*)\+(0x[0-9a-f]+)\)'
                    r'(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
    if not match:
      return False

    record_addr = int(match.group(2), 16)
    record = int(match.group(3), 16)
    binary = match.group(4)
    addr = int(match.group(5), 16)
    buildid = match.group(6)
    # NOTE(review): presumably the frame's base tag is derived from the record
    # slot address and the top 16 bits of the record hold the frame pointer
    # in 16-byte units - confirm against the HWASan runtime.
    base_tag = (record_addr >> 3) & 0xFF
    fp = (record >> 48) << 4

    for local in self.iter_locals(binary, addr, buildid):
      frame_offset = local[3]
      size = local[4]
      if frame_offset is None or size is None:
        continue
      # Only the low 20 bits of both addresses are compared.
      obj_offset = (self.__last_access_address & fp_mask) - ((fp & fp_mask) + frame_offset)
      tag_offset = local[5]
      if not ignore_tags and (tag_offset is None or base_tag ^ tag_offset != self.__last_access_tag):
        continue
      if obj_offset < 0 or obj_offset >= size:
        # Near miss: keep it for finish_tag_dump().
        self.__offsets.append((obj_offset, size, local))
        continue
      self.print('')
      self.print('Potentially referenced stack object:')
      self.print('  %d bytes inside a variable "%s" in stack frame of function "%s"' % (obj_offset, local[2], local[0]))
      self.print('  at %s' % (local[1],))
      self.__matched_stack_uas = True
    return True
385
def extract_version(s):
  """Return the numeric version suffix of a name like "llvm-symbolizer-14".

  Used as a sort key to prefer the highest versioned binary.

  Args:
    s: file name or path.

  Returns:
    The text after the last '-' converted to float, or 0 if there is no '-'
    or the suffix is not a number (for example "14.0.6" or "symbolizer"), so
    that such names sort last instead of aborting the search.
  """
  idx = s.rfind('-')
  if idx == -1:
    return 0
  try:
    return float(s[idx + 1:])
  except ValueError:
    return 0
392
def main():
  """Symbolize the HWASan report read from stdin and print it to stdout.

  Parses the command line, determines where to look for binaries with
  symbols and which source prefixes to strip, locates llvm-symbolizer and
  then processes stdin line by line. Exits with status 1 if a symbols
  directory or the symbolizer is missing, or if --linkify is used without
  --html.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('-d', action='store_true')
  parser.add_argument('-v', action='store_true')
  parser.add_argument('--ignore-tags', action='store_true')
  parser.add_argument('--symbols', action='append')
  parser.add_argument('--source', action='append')
  parser.add_argument('--index', action='store_true')
  parser.add_argument('--symbolizer')
  parser.add_argument('--linkify', type=str)
  parser.add_argument('--html', action='store_true')
  parser.add_argument('args', nargs=argparse.REMAINDER)
  args = parser.parse_args()

  # Unstripped binaries location.
  binary_prefixes = args.symbols or []
  if not binary_prefixes:
    if 'ANDROID_PRODUCT_OUT' in os.environ:
      product_out = os.path.join(os.environ['ANDROID_PRODUCT_OUT'], 'symbols')
      binary_prefixes.append(product_out)
    binary_prefixes.append('/')

  for p in binary_prefixes:
    if not os.path.isdir(p):
      print("Symbols path does not exist or is not a directory:", p, file=sys.stderr)
      sys.exit(1)

  # Source location.
  paths_to_cut = args.source or []
  if not paths_to_cut:
    paths_to_cut.append(os.getcwd() + '/')
    if 'ANDROID_BUILD_TOP' in os.environ:
      paths_to_cut.append(os.environ['ANDROID_BUILD_TOP'] + '/')

  # llvm-symbolizer binary.
  # 1. --symbolizer flag
  # 2. environment variable
  # 3. unsuffixed binary in the directory of this script
  # 4. if inside Android platform, prebuilt binary at a known path
  # 5. first "llvm-symbolizer", then "llvm-symbolizer-$VER" with the
  #    highest available version in $PATH
  symbolizer_path = args.symbolizer
  if not symbolizer_path:
    if 'LLVM_SYMBOLIZER_PATH' in os.environ:
      symbolizer_path = os.environ['LLVM_SYMBOLIZER_PATH']
    elif 'HWASAN_SYMBOLIZER_PATH' in os.environ:
      symbolizer_path = os.environ['HWASAN_SYMBOLIZER_PATH']

  if not symbolizer_path:
    s = os.path.join(os.path.dirname(sys.argv[0]), 'llvm-symbolizer')
    if os.path.exists(s):
      symbolizer_path = s

  if not symbolizer_path:
    if 'ANDROID_BUILD_TOP' in os.environ:
      s = os.path.join(os.environ['ANDROID_BUILD_TOP'], 'prebuilts/clang/host/linux-x86/llvm-binutils-stable/llvm-symbolizer')
      if os.path.exists(s):
        symbolizer_path = s

  # PATH may be unset; treat that as an empty search path instead of failing
  # with a KeyError.
  path_env = os.environ.get("PATH")
  search_dirs = path_env.split(os.pathsep) if path_env is not None else []

  if not symbolizer_path:
    for path in search_dirs:
      p = os.path.join(path, 'llvm-symbolizer')
      if os.path.exists(p):
        symbolizer_path = p
        break

  if not symbolizer_path:
    for path in search_dirs:
      candidates = glob.glob(os.path.join(path, 'llvm-symbolizer-*'))
      if len(candidates) > 0:
        candidates.sort(key = extract_version, reverse = True)
        symbolizer_path = candidates[0]
        break

  # Every lookup above may have failed, which leaves symbolizer_path as None;
  # os.path.exists(None) would raise a TypeError.
  if not symbolizer_path:
    print("Could not find llvm-symbolizer; use --symbolizer or set "
          "LLVM_SYMBOLIZER_PATH", file=sys.stderr)
    sys.exit(1)

  if not os.path.exists(symbolizer_path):
    print("Symbolizer path does not exist:", symbolizer_path, file=sys.stderr)
    sys.exit(1)

  if args.v:
    print("Looking for symbols in:")
    for s in binary_prefixes:
      print("  %s" % (s,))
    print("Stripping source path prefixes:")
    for s in paths_to_cut:
      print("  %s" % (s,))
    print("Using llvm-symbolizer binary in:\n  %s" % (symbolizer_path,))
    print()

  symbolizer = Symbolizer(symbolizer_path, binary_prefixes, paths_to_cut)
  symbolizer.enable_html(args.html)
  symbolizer.enable_logging(args.d)
  if args.index:
    symbolizer.build_index()

  if args.linkify:
    if not args.html:
      print('Need --html to --linkify', file=sys.stderr)
      sys.exit(1)
    symbolizer.read_linkify(args.linkify)

  # True while the rows of the memory tag dump are being consumed.
  tag_dump = False
  for line in sys.stdin:
    if sys.version_info.major < 3:
      line = line.decode('utf-8')
    if tag_dump:
      tag_dump = symbolizer.process_tag_dump_line(line)
      if tag_dump:
        continue
      # First line after the dump: evaluate it, then handle the line normally.
      symbolizer.finish_tag_dump()
    if 'Memory tags around the buggy address' in line:
      tag_dump = True

    symbolizer.save_access_address(line)
    if symbolizer.process_stack_history(line, ignore_tags=args.ignore_tags):
      continue
    symbolizer.symbolize_line(line)

  # The input may end in the middle of the tag dump (truncated log); still
  # evaluate what was collected.
  if tag_dump:
    symbolizer.finish_tag_dump()
509
510
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
  main()
513