#!/usr/bin/env python

"""A tool for extracting a list of symbols to export

When exporting symbols from a dll or exe we either need to mark the symbols in
the source code as __declspec(dllexport) or supply a list of symbols to the
linker. This program automates the latter by inspecting the symbol tables of a
list of link inputs and deciding which of those symbols need to be exported.

We can't just export all the defined symbols, as there's a limit of 65535
exported symbols and in clang we go way over that, particularly in a debug
build. Therefore a large part of the work is pruning symbols either which can't
be imported, or which we think are things that have definitions in public header
files (i.e. template instantiations) and we would get defined in the thing
importing these symbols anyway.
"""

from __future__ import print_function
import sys
import re
import os
import subprocess
import multiprocessing
import argparse
import platform

# Define a function which extracts a list of pairs of (symbols, is_def) from a
# library using llvm-nm because it can work both with regular and bitcode files.
# We use subprocess.Popen and yield a symbol at a time instead of using
# subprocess.check_output and returning a list as, especially on Windows, waiting
# for the entire output to be ready can take a significant amount of time.
def nm_get_symbols(tool, lib):
    """Yield (symbol, is_def) pairs for the global symbols of one link input.

    tool is the path of the nm executable to run and lib is the file passed to
    it. is_def is True for symbols defined in some section and False for
    undefined (type U) symbols. Lines that match neither pattern are ignored.
    """
    # '-P' means the output is in portable format,
    # '-g' means we only get global symbols,
    # '-Xany' enforces handling both 32- and 64-bit objects on AIX,
    # '--no-demangle' ensures that C++ symbol names are not demangled; note
    #   that llvm-nm does not demangle by default, but the system nm on AIX does
    #   that, so the behavior may change in the future,
    # '-p' do not waste time sorting the symbols.
    cmd = [tool, "-P", "-g", "-Xany", "--no-demangle", "-p"]
    # Look for external symbols that are defined in some section
    # The POSIX format is:
    #   name   type   value   size
    # The -P flag displays the size field for symbols only when applicable,
    # so the last field is optional. There's no space after the value field,
    # but \s+ match newline also, so \s+\S* will match the optional size field.
    defined_re = re.compile(r"^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$")
    # Look for undefined symbols, which have type U and may or may not
    # (depending on which nm is being used) have value and size.
    undefined_re = re.compile(r"^(\S+)\s+U\s+(\S+\s+\S*)?$")
    process = subprocess.Popen(
        cmd + [lib],
        bufsize=1,
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        universal_newlines=True,
    )
    try:
        process.stdin.close()
        for line in process.stdout:
            match = defined_re.match(line)
            if match:
                yield (match.group(1), True)
            match = undefined_re.match(line)
            if match:
                yield (match.group(1), False)
    finally:
        # Runs on normal exhaustion, on an exception, and when the generator is
        # closed early, so the pipe is always released and the child reaped.
        process.stdout.close()
        process.wait()


# Define a function which determines if the target is 32-bit Windows (as that's
# where calling convention name decoration happens).
def readobj_is_32bit_windows(tool, lib):
    """Return True if `tool --file-header lib` reports Format: COFF-i386.

    Returns False when the format is something else or when no "Format:" line
    is found in the output.
    """
    output = subprocess.check_output(
        [tool, "--file-header", lib], universal_newlines=True
    )
    for line in output.splitlines():
        match = re.match(r"Format: (\S+)", line)
        if match:
            return match.group(1) == "COFF-i386"
    return False


# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Return the name to export for a Microsoft-mangled symbol, or None.

    For unmangled names the returned string may differ from the input: when
    calling_convention_decoration is true the decoration is stripped first.
    """
    # Keep unmangled (i.e. extern "C") names
    if "?" not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r"[_@]([^@]+)", symbol)
            if match:
                symbol = match.group(1)
        # Discard floating point/SIMD constants.
        if symbol.startswith(("__xmm@", "__ymm@", "__real@")):
            return None
        return symbol
    # Deleting destructors are mangled as ??_G or ??_E and can be discarded
    # because link.exe gives you a warning telling you they can't be exported
    # if you don't
    elif symbol.startswith("??_G") or symbol.startswith("??_E"):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    elif re.search(r"\?A(0x\w+)?@", symbol):
        return None
    # Skip X86GenMnemonicTables functions, they are not exposed from llvm/include/.
    elif re.match(r"\?is[A-Z0-9]*@X86@llvm", symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    elif re.search(r"(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol):
        return symbol
    return None


# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Return the name to export for an Itanium-mangled symbol, or None.

    When calling_convention_decoration is true a single leading underscore is
    stripped, so the returned string may differ from the input.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith("_"):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith("_") and not symbol.startswith("."):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match(r"\.?_Z(T[VTIS])?(N.+)", symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Keep llvm:: and clang:: names
    elif names[0][0] == "4llvm" or names[0][0] == "5clang":
        return symbol
    # Discard everything else
    else:
        return None


# Certain kinds of complex manglings we assume cannot be part of a public
# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    pass


# Parse an itanium mangled name from the start of a string and return a
# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Split one name component off the front of an Itanium mangling.

    Returns (name, rest). A length-prefixed source name is returned with its
    length prefix kept (e.g. "4llvm"). If nothing recognisable is found the
    result is (None, arg).
    """
    # Check for a normal name
    match = re.match(r"(\d+)(.+)", arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match(r"([CD][123])(.+)", arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match(r"([^E]+)(.+)", arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg


# Parse an itanium mangled template argument list from the start of a string
# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Return what follows the template argument list at the start of arg.

    arg must start with "I". Returns None if the input is exhausted before the
    closing "E" is seen. Raises TooComplexName if an argument starts with "L"
    or "X", which is treated as the start of an expression.
    """
    # A template argument list starts with I
    assert arg.startswith("I"), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r"(\d+)(.+)", tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match(r"S[A-Z0-9]*_(.+)", tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith("I"):
            # May yield None, which ends the loop and makes us return None too
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith("N"):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith(("L", "X")):
            raise TooComplexName
        # End of the template
        elif tmp.startswith("E"):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    return None


# Parse an itanium mangled nested name and transform it into a list of pairs of
# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse the nested name at the start of arg, which must start with "N".

    Returns (names, rest) where names is a list of (name, is_template) pairs
    and rest is the text after the closing "E". Returns (None, None) when the
    name cannot be parsed. TooComplexName raised while skipping template
    arguments propagates to the caller.
    """
    # A nested name starts with N
    assert arg.startswith("N"), arg
    ret = []

    # Skip past the N, and possibly a substitution
    match = re.match(r"NS[A-Z0-9]*_(.+)", arg)
    if match:
        tmp = match.group(1)
    else:
        tmp = arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    match = re.match(r"[rVKRO]*(.+)", tmp)
    if match:
        tmp = match.group(1)

    # Repeatedly parse names from the string until we reach the end of the
    # nested name
    while tmp:
        # An E ends the nested name
        if tmp.startswith("E"):
            return ret, tmp[1:]
        # Parse a name
        name_part, tmp = parse_itanium_name(tmp)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        is_template = False
        # If this name is a template record that, then skip the template
        # arguments
        if tmp.startswith("I"):
            # May yield None, which ends the loop and reports a parse failure
            tmp = skip_itanium_template(tmp)
            is_template = True
        # Add the name to the list
        ret.append((name_part, is_template))

    # If we get here then something went wrong
    return None, None


# Parse a microsoft mangled symbol and return a list of pairs of
# (name, is_template). This is very rudimentary and does just enough
# in order to determine if the first or second component is a template.
def parse_microsoft_mangling(arg):
    """Return the (name, is_template) components of a Microsoft mangling.

    A string that does not start with "?" is returned as a single
    non-template component.
    """
    # If the name doesn't start with ? this isn't a mangled name
    if not arg.startswith("?"):
        return [(arg, False)]
    arg = arg[1:]
    components = []
    while arg:
        # If we see an empty component we've reached the end
        if arg.startswith("@"):
            return components
        # Check for a simple name
        match = re.match(r"(\w+)@(.+)", arg)
        if match:
            components.append((match.group(1), False))
            arg = match.group(2)
            continue
        # Check for a special function name
        match = re.match(r"(\?_?\w)(.+)", arg)
        if match:
            components.append((match.group(1), False))
            arg = match.group(2)
            continue
        # Check for a template name
        match = re.match(r"\?\$(\w+)@[^@]+@(.+)", arg)
        if match:
            components.append((match.group(1), True))
            arg = match.group(2)
            continue
        # Some other kind of name that we can't handle
        components.append((arg, False))
        return components
    return components


def extract_symbols(arg):
    """Collect the kept symbols of one library.

    arg is a single tuple (llvm_nm_path, should_keep_symbol,
    calling_convention_decoration, lib). Returns (symbol_defs, symbol_refs):
    a dict mapping each kept defined symbol to the number of times it was
    seen, and a set of the kept undefined symbols.
    """
    llvm_nm_path, should_keep_symbol, calling_convention_decoration, lib = arg
    symbol_defs = {}
    symbol_refs = set()
    for symbol, is_def in nm_get_symbols(llvm_nm_path, lib):
        # The filter returns the (possibly undecorated) name, or None to drop
        symbol = should_keep_symbol(symbol, calling_convention_decoration)
        if symbol:
            if is_def:
                symbol_defs[symbol] = symbol_defs.get(symbol, 0) + 1
            else:
                symbol_refs.add(symbol)
    return (symbol_defs, symbol_refs)


def get_template_name(sym, mangling):
    """Return the first template component of sym, or None if there is none.

    mangling selects the Microsoft parser when it equals "microsoft";
    any other value uses the Itanium parser. None is also returned when the
    symbol cannot be parsed or is too complex.
    """
    # Parse the mangling into a list of (name, is_template)
    try:
        if mangling == "microsoft":
            names = parse_microsoft_mangling(sym)
        else:
            match = re.match(r"\.?_Z(T[VTIS])?(N.+)", sym)
            if match:
                names, _ = parse_itanium_nested_name(match.group(2))
            else:
                names = None
    except TooComplexName:
        return None

    if not names:
        return None

    # If any component is a template then return it
    for name, is_template in names:
        if is_template:
            return name

    # Not a template
    return None


def parse_tool_path(parser, tool, val):
    """argparse type callback: check that val can be launched, then return it.

    The candidate is run once with no arguments. If it cannot be started,
    parser.error() is called with a message naming tool, which exits the
    program.
    """
    try:
        # Redirect all std streams to the null device as we don't want any
        # output and we don't want the process to wait for something on stdin.
        subprocess.call(
            [val],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, ValueError):
        parser.error(f"Invalid path for {tool}")
    return val


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Extract symbols to export from libraries"
    )
    parser.add_argument(
        "--mangling",
        choices=["itanium", "microsoft"],
        required=True,
        help="expected symbol mangling scheme",
    )
    # Both tools are used unconditionally below, so require them up front
    # instead of failing later when None is handed to subprocess.
    parser.add_argument(
        "--nm",
        metavar="path",
        type=lambda x: parse_tool_path(parser, "nm", x),
        required=True,
        help="path to the llvm-nm executable",
    )
    parser.add_argument(
        "--readobj",
        metavar="path",
        type=lambda x: parse_tool_path(parser, "readobj", x),
        required=True,
        help="path to the llvm-readobj executable",
    )
    parser.add_argument(
        "libs",
        metavar="lib",
        type=str,
        nargs="+",
        help="libraries to extract symbols from",
    )
    parser.add_argument("-o", metavar="file", type=str, help="output to file")
    args = parser.parse_args()

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == "microsoft":
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    # When invoked by cmake the arguments are the cmake target names of the
    # libraries, so we need to add .lib/.a to the end and maybe lib to the
    # start to get the filename. Also allow objects.
    suffixes = (".lib", ".a", ".obj", ".o")
    libs = []
    for lib in args.libs:
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib + s):
                    lib = lib + s
                    break
                if os.path.exists("lib" + lib + s):
                    lib = "lib" + lib + s
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument " + lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = readobj_is_32bit_windows(args.readobj, libs[0])

    is_aix = platform.system() == "AIX"

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    # FIXME: On AIX, the default pool size can be too big for a logical
    #        partition's allocated memory, and can lead to an out of memory
    #        IO error. We are setting the pool size to 8 to avoid such
    #        errors at the moment, and will look for a graceful solution later.
    pool = multiprocessing.Pool(8) if is_aix else multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [
            (args.nm, should_keep_symbol, calling_convention_decoration, x)
            for x in libs
        ]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbol_defs = {}
    symbol_refs = set()
    for this_lib_defs, this_lib_refs in libs_symbols:
        for k, v in this_lib_defs.items():
            symbol_defs[k] = symbol_defs.get(k, 0) + v
        symbol_refs.update(this_lib_refs)

    # Find which template instantiations are referenced at least once.
    template_instantiation_refs = set()
    for sym in symbol_refs:
        template = get_template_name(sym, args.mangling)
        if template:
            template_instantiation_refs.add(template)

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Are not a template instantiation that isn't referenced anywhere. This
    #    is because we need to export any explicitly instantiated templates,
    #    and we expect those to be referenced in some object.
    outfile = open(args.o, "w") if args.o else sys.stdout
    try:
        for k, v in symbol_defs.items():
            # On AIX, export function descriptors instead of function entries.
            if is_aix and k.startswith("."):
                continue
            template = get_template_name(k, args.mangling)
            if v == 1 and (not template or template in template_instantiation_refs):
                print(k, file=outfile)
    finally:
        # Only close a file we opened ourselves; leave sys.stdout alone.
        if outfile is not sys.stdout:
            outfile.close()