#!/usr/bin/env python3
#===- lib/hwasan/scripts/hwasan_symbolize ----------------------------------===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#
#
# HWAddressSanitizer offline symbolization script.
#
#===------------------------------------------------------------------------===#

from __future__ import print_function
from __future__ import unicode_literals

import argparse
import glob
import html
import json
import mmap
import os
import re
import struct
import subprocess
import sys

if sys.version_info.major < 3:
    # Simulate Python 3.x behaviour of defaulting to UTF-8 for print. This is
    # important in case any symbols are non-ASCII.
    import codecs
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout)

# Below, a parser for a subset of ELF. It only supports 64 bit, little-endian,
# and only parses what is necessary to find the build ids. It uses a memoryview
# into an mmap to avoid copying.
# ELF header: total size and byte offsets of the fields read by handle_elf().
Ehdr_size = 64
e_shnum_offset = 60
e_shoff_offset = 40

# Section header: total size and byte offsets of the fields read by
# handle_Shdr().
Shdr_size = 64
sh_type_offset = 4
sh_offset_offset = 24
sh_size_offset = 32
SHT_NOTE = 7

# Note header: three little-endian 32-bit words (namesz, descsz, type).
Nhdr_size = 12
NT_GNU_BUILD_ID = 3

# One line of the "Memory tags around the buggy address" dump: an address
# followed by 16 two-digit hex tags, the faulting one wrapped in brackets.
_TAG_DUMP_LINE_RE = re.compile(
    r'.*?(0x[0-9a-f]+):' + r'([ ]*[\[ ][0-9a-f][0-9a-f]\]?)' * 16)


def align_up(size, alignment):
    """Round `size` up to a multiple of `alignment`.

    The mask arithmetic is only correct when `alignment` is a power of two.
    """
    return (size + alignment - 1) & ~(alignment - 1)


def handle_Nhdr(mv, sh_size):
    """Scan a note section for the GNU build id.

    Args:
        mv: bytes-like view positioned at the start of the note section.
        sh_size: size of the section as declared by its section header.

    Returns:
        The build id as a lowercase hex string, or None if the section does
        not contain an NT_GNU_BUILD_ID note owned by "GNU".
    """
    # Never read a header past the bytes actually available: a truncated or
    # padded section yields None instead of a struct.error.
    end = min(sh_size, len(mv))
    offset = 0
    while offset + Nhdr_size <= end:
        n_namesz, n_descsz, n_type = struct.unpack_from('<III', mv, offset)
        name_start = offset + Nhdr_size
        if (n_type == NT_GNU_BUILD_ID and n_namesz == 4 and
                mv[name_start: name_start + 4] == b"GNU\x00"):
            desc_start = name_start + 4
            return mv[desc_start: desc_start + n_descsz].hex()
        # Name and descriptor are each padded to a 4-byte boundary.
        offset += Nhdr_size + align_up(n_namesz, 4) + align_up(n_descsz, 4)
    return None


def handle_Shdr(mv):
    """Decode one section header.

    Returns:
        (sh_offset, sh_size) when the section type is SHT_NOTE, otherwise
        (None, None).
    """
    sh_type, = struct.unpack_from('<I', mv, sh_type_offset)
    if sh_type != SHT_NOTE:
        return None, None
    sh_offset, = struct.unpack_from('<Q', mv, sh_offset_offset)
    sh_size, = struct.unpack_from('<Q', mv, sh_size_offset)
    return sh_offset, sh_size


def handle_elf(mv):
    """Return the GNU build id found in the ELF image `mv`, or None.

    None is returned when the image is not a 64-bit little-endian ELF file or
    when none of its note sections carries a build id.
    """
    # \x02 is ELFCLASS64, \x01 is ELFDATA2LSB. HWASan currently only works on
    # 64-bit little endian platforms (x86_64 and ARM64). If this changes, we
    # will have to extend the parsing code.
    if mv[:6] != b'\x7fELF\x02\x01':
        return None
    e_shnum, = struct.unpack_from('<H', mv, e_shnum_offset)
    e_shoff, = struct.unpack_from('<Q', mv, e_shoff_offset)
    for i in range(e_shnum):
        start = e_shoff + i * Shdr_size
        sh_offset, sh_size = handle_Shdr(mv[start: start + Shdr_size])
        if sh_offset is None:
            continue
        result = handle_Nhdr(mv[sh_offset: sh_offset + sh_size], sh_size)
        if result is not None:
            return result
    return None


def get_buildid(filename):
    """Return the GNU build id of the file at `filename`, or None.

    Files smaller than an ELF header are rejected without being mapped.
    """
    # Binary mode: the content is never decoded, only memory-mapped.
    with open(filename, "rb") as fd:
        if os.fstat(fd.fileno()).st_size < Ehdr_size:
            return None
        with mmap.mmap(fd.fileno(), 0, access=mmap.ACCESS_READ) as m:
            with memoryview(m) as mv:
                return handle_elf(mv)


def _int_or_none(token):
    """Parse a decimal field of a FRAME response; '??' means unknown."""
    return None if token == '??' else int(token)


class Symbolizer:
    """Rewrites HWASan report lines using an llvm-symbolizer subprocess.

    Args:
        path: path of the llvm-symbolizer executable to spawn.
        binary_prefixes: directories searched for unstripped binaries.
        paths_to_cut: source path prefixes removed from reported file names.
    """

    def __init__(self, path, binary_prefixes, paths_to_cut):
        self.__pipe = None
        self.__path = path
        self.__binary_prefixes = binary_prefixes
        self.__paths_to_cut = paths_to_cut
        self.__log = False
        self.__warnings = set()
        self.__index = {}
        self.__link_prefixes = []
        self.__html = False
        self.__last_access_address = None
        self.__last_access_tag = None
        self.__tag_dump = []
        self.__tag_dump_match_idx = None
        self.__matched_stack_uas = False
        self.__offsets = []

    def enable_html(self, enable):
        """Turn HTML output (escaping, <br/> line ends, links) on or off."""
        self.__html = enable

    def enable_logging(self, enable):
        """Turn logging of the symbolizer conversation to stderr on or off."""
        self.__log = enable

    def maybe_escape(self, text):
        """Return `text` HTML-escaped when HTML output is enabled.

        In plain-text mode `text` is returned unchanged.
        """
        if not self.__html:
            return text
        # We need to manually use &nbsp; for leading spaces, html.escape does
        # not do that, and HTML ignores them.
        stripped = text.lstrip(' ')
        indent = len(text) - len(stripped)
        return indent * '&nbsp;' + html.escape(stripped)

    def print(self, line, escape=True):
        """Print one output line, escaping it first unless `escape` is False.

        In HTML mode a <br/> is appended to every line.
        """
        if escape:
            line = self.maybe_escape(line)
        if self.__html:
            line += '<br/>'
        print(line)

    def read_linkify(self, filename):
        """Load (prefix, link template) pairs from the JSON file `filename`.

        The file must hold a list of objects with "prefix" and "link" keys.
        """
        with open(filename, 'r') as fd:
            data = json.load(fd)
        self.__link_prefixes = [(e["prefix"], e["link"]) for e in data]

    def __open_pipe(self):
        """Spawn the symbolizer subprocess on first use."""
        if not self.__pipe:
            opt = {}
            if sys.version_info.major > 2:
                opt['encoding'] = 'utf-8'
            self.__pipe = subprocess.Popen(
                [self.__path, "--inlining", "--functions"],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                **opt)

    class __EOF(Exception):
        """Raised by __read() when the symbolizer returns an empty line."""
        pass

    def __write(self, s):
        """Send one request line to the symbolizer."""
        print(s, file=self.__pipe.stdin)
        self.__pipe.stdin.flush()
        if self.__log:
            print("#>> |%s|" % (s,), file=sys.stderr)

    def __read(self):
        """Read one response line; an empty line raises __EOF."""
        s = self.__pipe.stdout.readline().rstrip()
        if self.__log:
            print("# << |%s|" % (s,), file=sys.stderr)
        if s == '':
            raise Symbolizer.__EOF
        return s

    def __process_source_path(self, file_name):
        """Shorten a "file:line" string reported by the symbolizer.

        Configured source prefixes are cut, and sanitizer runtime and crtstuff
        locations are replaced by short placeholders.
        """
        for path_to_cut in self.__paths_to_cut:
            # The prefix is a literal path, not a pattern: escape it so that
            # characters such as '+' or '(' cannot break the regex.
            file_name = re.sub(".*" + re.escape(path_to_cut), "", file_name)
        file_name = re.sub(".*hwasan_[a-z_]*.(cc|h):[0-9]*", "[hwasan_rtl]", file_name)
        file_name = re.sub(".*asan_[a-z_]*.(cc|h):[0-9]*", "[asan_rtl]", file_name)
        file_name = re.sub(".*crtstuff.c:0", "???:0", file_name)
        return file_name

    def __process_binary_name(self, name, buildid):
        """Map a binary path from the report to a local file with symbols.

        Lookup order: the build id index, `name` under each symbols prefix
        (also trying the com.google.android APEX spelling), then the bare
        file name under each prefix.

        Returns:
            The local path, or None (after a one-time warning on stderr).
        """
        if name.startswith('/'):
            name = name[1:]
        if buildid is not None and buildid in self.__index:
            return self.__index[buildid]

        apex_prefix = "apex/com.android."
        for p in self.__binary_prefixes:
            full_path = os.path.join(p, name)
            if os.path.exists(full_path):
                return full_path
            if name.startswith(apex_prefix):
                full_path = os.path.join(
                    p, "apex/com.google.android." + name[len(apex_prefix):])
                if os.path.exists(full_path):
                    return full_path
        # Try stripping extra path components as the last resort.
        for p in self.__binary_prefixes:
            full_path = os.path.join(p, os.path.basename(name))
            if os.path.exists(full_path):
                return full_path
        if name not in self.__warnings:
            print("Could not find symbols for", name, file=sys.stderr)
            self.__warnings.add(name)
        return None

    def iter_locals(self, binary, addr, buildid):
        """Yield the local variables of the frame at `addr` in `binary`.

        Each item is (function_name, file_line, local_name, offset, size,
        tag_offset); the three numeric fields are None when the symbolizer
        reports '??'. Nothing is yielded if the binary cannot be located.
        """
        # Resolve the binary first so that no symbolizer process is spawned
        # when there is nothing to ask it.
        binary = self.__process_binary_name(binary, buildid)
        if not binary:
            return
        self.__open_pipe()
        self.__write("FRAME %s %s" % (binary, addr))
        try:
            while True:
                function_name = self.__read()
                local_name = self.__read()
                file_line = self.__read()
                extra = self.__read().split()

                file_line = self.__process_source_path(file_line)
                offset = _int_or_none(extra[0])
                size = _int_or_none(extra[1])
                tag_offset = _int_or_none(extra[2])
                yield (function_name, file_line, local_name, offset, size,
                       tag_offset)
        except Symbolizer.__EOF:
            pass

    def iter_call_stack(self, binary, buildid, addr):
        """Yield (function_name, file_line) for `addr` in `binary`.

        Several items are produced when the address lies in inlined code.
        Nothing is yielded if the binary cannot be located.
        """
        binary = self.__process_binary_name(binary, buildid)
        if not binary:
            return
        self.__open_pipe()
        self.__write("CODE %s %s" % (binary, addr))
        try:
            while True:
                function_name = self.__read()
                file_line = self.__read()
                file_line = self.__process_source_path(file_line)
                yield (function_name, file_line)
        except Symbolizer.__EOF:
            pass

    def maybe_linkify(self, file_line):
        """Return `file_line` ready for output, as a link when possible.

        In plain-text mode the string is returned unchanged. In HTML mode it
        is escaped, and wrapped in an <a> element when a linkify prefix
        matches the file name (the longest matching prefix wins).
        """
        if not self.__html:
            return file_line
        # This string is printed with escape=False by the caller, so it has to
        # be made HTML-safe here.
        text = html.escape(file_line)
        if not self.__link_prefixes:
            return text
        # partition() tolerates a location without any ':' separator.
        filename, _, line_col = file_line.partition(':')
        if not line_col:
            line = '0'  # simplify the link generation
        else:
            line = line_col.split(':')[0]
        longest_prefix = max((
            (prefix, link) for prefix, link in self.__link_prefixes
            if filename.startswith(prefix)),
            key=lambda x: len(x[0]), default=None)
        if longest_prefix is None:
            return text
        prefix, link = longest_prefix
        return '<a href="{}">{}</a>'.format(
            html.escape(link.format(file=filename[len(prefix):], line=line,
                                    file_line=file_line, prefix=prefix)),
            text)

    def build_index(self):
        """Index every file under the symbols prefixes by its build id.

        Files that vanish during the walk are skipped silently; other parse
        failures are reported on stderr and skipped.
        """
        for p in self.__binary_prefixes:
            for dname, _, fnames in os.walk(p):
                for fn in fnames:
                    filename = os.path.join(dname, fn)
                    try:
                        bid = get_buildid(filename)
                    except FileNotFoundError:
                        continue
                    except Exception as e:
                        print("Failed to parse {}: {}".format(filename, e),
                              file=sys.stderr)
                        continue
                    if bid is not None:
                        self.__index[bid] = filename

    def symbolize_line(self, line):
        """Print `line`, replacing a stack frame with its symbolized form.

        Lines that are not stack frames, or whose frame cannot be symbolized,
        are printed unchanged apart from trailing whitespace.
        """
        #0 0x7f6e35cf2e45 (/blah/foo.so+0x11fe45) (BuildId: 4abce4cd41ea5c2f34753297b7e774d9)
        match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]*) *\((.*)\+(0x[0-9a-f]+)\)'
                         r'(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
        frames = []
        if match:
            binary = match.group(5)
            addr = int(match.group(6), 16)
            buildid = match.group(7)
            frames = list(self.iter_call_stack(binary, buildid, addr))

        if not frames:
            self.print(line.rstrip())
            return

        self.print(
            self.maybe_escape(
                "%s#%s%s%s in " % (match.group(1), match.group(2),
                                   match.group(3), frames[0][0])
            ) + self.maybe_linkify(frames[0][1]),
            escape=False)
        # Inlined frames get a "->" marker lined up under the frame number.
        space1 = ' ' * match.end(1)
        space2 = ' ' * (match.start(4) - match.end(1) - 2)
        for function_name, file_line in frames[1:]:
            self.print(
                self.maybe_escape("%s->%s%s in " % (space1, space2, function_name))
                + self.maybe_linkify(file_line), escape=False)

    def save_access_address(self, line):
        """Record the faulting address and pointer tag if `line` has them."""
        match = re.match(r'^(.*?)HWAddressSanitizer: tag-mismatch on address (0x[0-9a-f]+) ', line, re.UNICODE)
        if match:
            self.__last_access_address = int(match.group(2), 16)
        match = re.match(r'^(.*?) of size [0-9]+ at 0x[0-9a-f]* tags: ([0-9a-f]+)/[0-9a-f]+(\([0-9a-f]+\))? \(ptr/mem\)', line, re.UNICODE)
        if match:
            self.__last_access_tag = int(match.group(2), 16)

    def process_tag_dump_line(self, line, ignore_tags=False):
        """Collect one row of the memory tag dump.

        `ignore_tags` is accepted for interface compatibility and not used.

        Returns:
            True if `line` was a tag dump row, False otherwise.
        """
        m = _TAG_DUMP_LINE_RE.match(line)
        if m is None:
            return False
        tags = m.group(*range(2, 18))
        fault = [i for i, x in enumerate(tags) if '[' in x]
        if fault:
            self.__tag_dump_match_idx = len(self.__tag_dump) + fault[0]
        self.__tag_dump.extend(int(x.strip(' []'), 16) for x in tags)
        return True

    def finish_tag_dump(self):
        """Report stack objects near the access, based on the tag dump.

        Does nothing if an exact match was already reported by
        process_stack_history() or if the dump had no bracketed tag.
        """
        if self.__matched_stack_uas or self.__tag_dump_match_idx is None:
            return
        for offset, size, local in sorted(self.__offsets, key=lambda x: abs(x[0])):
            idx = self.__tag_dump_match_idx - offset // 16
            # idx == len(...) is already out of range.
            if idx < 0 or idx >= len(self.__tag_dump):
                continue
            if self.__tag_dump[idx] == self.__last_access_tag:
                self.print('')
                self.print('Potentially referenced stack object:')
                if offset > 0:
                    self.print(' %d bytes after a variable "%s" in stack frame of function "%s"' % (offset - size, local[2], local[0]))
                if offset < 0:
                    self.print(' %d bytes before a variable "%s" in stack frame of function "%s"' % (-offset, local[2], local[0]))
                self.print(' at %s' % (local[1],))

    def process_stack_history(self, line, ignore_tags=False):
        """Handle one line of the "Previously allocated frames" section.

        For a stack history record, every local whose tag matches the access
        (or every local, with `ignore_tags`) is checked against the faulting
        address. Hits are printed; near misses are kept for finish_tag_dump().

        Returns:
            True if the line was consumed and must not be printed, else False.
        """
        if self.__last_access_address is None or self.__last_access_tag is None:
            return False
        if re.match(r'Previously allocated frames:', line, re.UNICODE):
            return True
        fp_mask = (1 << 20) - 1
        # record_addr:0x1234ABCD record:0x1234ABCD (/path/to/binary+0x1234ABCD) (BuildId: 4abce4cd41ea5c2f34753297b7e774d9)
        match = re.match(r'^(.*?)record_addr:(0x[0-9a-f]+) +record:(0x[0-9a-f]+) +\((.*)\+(0x[0-9a-f]+)\)'
                         r'(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
        if not match:
            return False

        record_addr = int(match.group(2), 16)
        record = int(match.group(3), 16)
        binary = match.group(4)
        addr = int(match.group(5), 16)
        buildid = match.group(6)
        base_tag = (record_addr >> 3) & 0xFF
        fp = (record >> 48) << 4

        for local in self.iter_locals(binary, addr, buildid):
            frame_offset = local[3]
            size = local[4]
            if frame_offset is None or size is None:
                continue
            obj_offset = (self.__last_access_address & fp_mask) - ((fp & fp_mask) + frame_offset)
            tag_offset = local[5]
            if not ignore_tags and (tag_offset is None or base_tag ^ tag_offset != self.__last_access_tag):
                continue
            if obj_offset < 0 or obj_offset >= size:
                self.__offsets.append((obj_offset, size, local))
                continue
            self.print('')
            self.print('Potentially referenced stack object:')
            self.print(' %d bytes inside a variable "%s" in stack frame of function "%s"' % (obj_offset, local[2], local[0]))
            self.print(' at %s' % (local[1],))
            self.__matched_stack_uas = True
        return True


def extract_version(s):
    """Return the numeric suffix after the last '-' in `s`.

    Used as a sort key for "llvm-symbolizer-$VER" candidates. Returns 0 when
    there is no '-' or when the suffix is not a number.
    """
    idx = s.rfind('-')
    if idx == -1:
        return 0
    try:
        return float(s[idx + 1:])
    except ValueError:
        # e.g. "llvm-symbolizer-wrapper": treat as the lowest version.
        return 0


def _find_symbolizer(explicit_path):
    """Locate the llvm-symbolizer binary; return its path or None.

    Search order:
      1. --symbolizer flag
      2. environment variable
      3. unsuffixed binary in the current directory
      4. if inside Android platform, prebuilt binary at a known path
      5. first "llvm-symbolizer", then "llvm-symbolizer-$VER" with the
         highest available version in $PATH
    """
    symbolizer_path = explicit_path
    if not symbolizer_path:
        if 'LLVM_SYMBOLIZER_PATH' in os.environ:
            symbolizer_path = os.environ['LLVM_SYMBOLIZER_PATH']
        elif 'HWASAN_SYMBOLIZER_PATH' in os.environ:
            symbolizer_path = os.environ['HWASAN_SYMBOLIZER_PATH']
    if symbolizer_path:
        return symbolizer_path

    s = os.path.join(os.path.dirname(sys.argv[0]), 'llvm-symbolizer')
    if os.path.exists(s):
        return s

    if 'ANDROID_BUILD_TOP' in os.environ:
        s = os.path.join(os.environ['ANDROID_BUILD_TOP'], 'prebuilts/clang/host/linux-x86/llvm-binutils-stable/llvm-symbolizer')
        if os.path.exists(s):
            return s

    # An unset PATH means there is nothing to search, not a crash.
    path_var = os.environ.get("PATH")
    path_dirs = [] if path_var is None else path_var.split(os.pathsep)

    for path in path_dirs:
        p = os.path.join(path, 'llvm-symbolizer')
        if os.path.exists(p):
            return p

    for path in path_dirs:
        candidates = glob.glob(os.path.join(path, 'llvm-symbolizer-*'))
        if candidates:
            candidates.sort(key=extract_version, reverse=True)
            return candidates[0]

    return None


def main():
    """Symbolize a HWASan report read from stdin and print it to stdout."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', action='store_true')
    parser.add_argument('-v', action='store_true')
    parser.add_argument('--ignore-tags', action='store_true')
    parser.add_argument('--symbols', action='append')
    parser.add_argument('--source', action='append')
    parser.add_argument('--index', action='store_true')
    parser.add_argument('--symbolizer')
    parser.add_argument('--linkify', type=str)
    parser.add_argument('--html', action='store_true')
    parser.add_argument('args', nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # Unstripped binaries location.
    binary_prefixes = args.symbols or []
    if not binary_prefixes:
        if 'ANDROID_PRODUCT_OUT' in os.environ:
            product_out = os.path.join(os.environ['ANDROID_PRODUCT_OUT'], 'symbols')
            binary_prefixes.append(product_out)
        binary_prefixes.append('/')

    for p in binary_prefixes:
        if not os.path.isdir(p):
            print("Symbols path does not exist or is not a directory:", p, file=sys.stderr)
            sys.exit(1)

    # Source location.
    paths_to_cut = args.source or []
    if not paths_to_cut:
        paths_to_cut.append(os.getcwd() + '/')
        if 'ANDROID_BUILD_TOP' in os.environ:
            paths_to_cut.append(os.environ['ANDROID_BUILD_TOP'] + '/')

    # llvm-symbolizer binary.
    symbolizer_path = _find_symbolizer(args.symbolizer)
    if not symbolizer_path:
        print("Could not find llvm-symbolizer; pass --symbolizer or set "
              "LLVM_SYMBOLIZER_PATH", file=sys.stderr)
        sys.exit(1)
    if not os.path.exists(symbolizer_path):
        print("Symbolizer path does not exist:", symbolizer_path, file=sys.stderr)
        sys.exit(1)

    if args.v:
        print("Looking for symbols in:")
        for s in binary_prefixes:
            print(" %s" % (s,))
        print("Stripping source path prefixes:")
        for s in paths_to_cut:
            print(" %s" % (s,))
        print("Using llvm-symbolizer binary in:\n %s" % (symbolizer_path,))
        print()

    symbolizer = Symbolizer(symbolizer_path, binary_prefixes, paths_to_cut)
    symbolizer.enable_html(args.html)
    symbolizer.enable_logging(args.d)
    if args.index:
        symbolizer.build_index()

    if args.linkify:
        if not args.html:
            print('Need --html to --linkify', file=sys.stderr)
            sys.exit(1)
        symbolizer.read_linkify(args.linkify)

    tag_dump = False
    for line in sys.stdin:
        if sys.version_info.major < 3:
            line = line.decode('utf-8')
        if tag_dump:
            tag_dump = symbolizer.process_tag_dump_line(line)
            if tag_dump:
                continue
            # First line after the dump: report what the dump revealed.
            symbolizer.finish_tag_dump()
        if 'Memory tags around the buggy address' in line:
            tag_dump = True

        symbolizer.save_access_address(line)
        if symbolizer.process_stack_history(line, ignore_tags=args.ignore_tags):
            continue
        symbolizer.symbolize_line(line)


if __name__ == '__main__':
    main()