xref: /llvm-project/llvm/utils/extract_symbols.py (revision e7303fe80a0bea124422219356c1c9e845110a77)
1#!/usr/bin/env python
2
3"""A tool for extracting a list of symbols to export
4
5When exporting symbols from a dll or exe we either need to mark the symbols in
6the source code as __declspec(dllexport) or supply a list of symbols to the
7linker. This program automates the latter by inspecting the symbol tables of a
8list of link inputs and deciding which of those symbols need to be exported.
9
10We can't just export all the defined symbols, as there's a limit of 65535
11exported symbols and in clang we go way over that, particularly in a debug
12build. Therefore a large part of the work is pruning symbols either which can't
13be imported, or which we think are things that have definitions in public header
14files (i.e. template instantiations) and we would get defined in the thing
15importing these symbols anyway.
16"""
17
18from __future__ import print_function
19import sys
20import re
21import os
22import subprocess
23import multiprocessing
24import argparse
25import platform
26
# Define a function which extracts a list of pairs of (symbol, is_def) from a
# library using llvm-nm because it can work both with regular and bitcode files.
# We use subprocess.Popen and yield a symbol at a time instead of using
# subprocess.check_output and returning a list as, especially on Windows, waiting
# for the entire output to be ready can take a significant amount of time.
def nm_get_symbols(tool, lib):
    """Yield (symbol, is_def) pairs for the global symbols found in lib.

    tool is the path of the nm executable to run and lib is the file handed
    to it. is_def is True for symbols defined in some section and False for
    undefined (type U) symbols. The exit status of the tool is not checked.
    The output pipe is closed and the child process is reaped even when the
    caller stops iterating early or an exception propagates.
    """
    # '-P' means the output is in portable format,
    # '-g' means we only get global symbols,
    # '-Xany' enforce handling both 32- and 64-bit objects on AIX,
    # '--no-demangle' ensure that C++ symbol names are not demangled; note
    #   that llvm-nm do not demangle by default, but the system nm on AIX does
    #   that, so the behavior may change in the future,
    # '-p' do not waste time sorting the symbols.
    cmd = [tool, "-P", "-g", "-Xany", "--no-demangle", "-p"]

    # External symbols that are defined in some section. The POSIX format is:
    #   name   type   value   size
    # The -P flag displays the size field for symbols only when applicable,
    # so the last field is optional. There's no space after the value field,
    # but \s+ match newline also, so \s+\S* will match the optional size field.
    defined_re = re.compile(r"^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$")
    # Undefined symbols, which have type U and may or may not (depending on
    # which nm is being used) have value and size.
    undefined_re = re.compile(r"^(\S+)\s+U\s+(\S+\s+\S*)?$")

    process = subprocess.Popen(
        cmd + [lib],
        bufsize=1,
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        universal_newlines=True,
    )
    try:
        # The tool must never wait for input from us.
        process.stdin.close()
        for line in process.stdout:
            match = defined_re.match(line)
            if match:
                yield (match.group(1), True)
                # The type field is a single token, so a line that matched
                # the defined pattern cannot also match the undefined one.
                continue
            match = undefined_re.match(line)
            if match:
                yield (match.group(1), False)
    finally:
        # Runs on normal exhaustion, on generator close and on errors, so the
        # pipe is never leaked and the child never lingers unreaped.
        process.stdout.close()
        process.wait()
65
66
# Report whether the target is 32-bit Windows, since that is the only place
# where calling convention name decoration happens.
def readobj_is_32bit_windows(tool, lib):
    """Return True when the first "Format:" line printed by tool for lib
    names COFF-i386, and False otherwise (including when no such line is
    printed)."""
    header = subprocess.check_output(
        [tool, "--file-header", lib], universal_newlines=True
    )
    format_re = re.compile(r"Format: (\S+)")
    # Only the first line that carries a format decides the answer.
    formats = (
        found.group(1)
        for found in map(format_re.match, header.splitlines())
        if found
    )
    return next(formats, None) == "COFF-i386"
78
79
# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. Looking at the
# identifier/type mangling is enough to decide which symbols could possibly be
# required and which can be thrown away.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Return the name to export for symbol, or None to discard it.

    When calling_convention_decoration is true, the decoration is removed
    from unmangled names before they are returned.
    """
    # Unmangled (i.e. extern "C") names are kept.
    if "?" not in symbol:
        name = symbol
        if calling_convention_decoration:
            # Drop the leading _ or @ and anything from the next @ onwards.
            undecorated = re.match(r"[_@]([^@]+)", name)
            if undecorated:
                name = undecorated.group(1)
        # Floating point/SIMD constants are discarded.
        if name.startswith(("__xmm@", "__ymm@", "__real@")):
            return None
        return name

    # Deleting destructors are mangled as ??_G / ??_E and can be discarded
    # because link.exe warns that they can't be exported if you don't.
    if symbol.startswith(("??_G", "??_E")):
        return None

    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # mentioning one can be discarded, as the anonymous namespace doesn't
    # exist outside of its translation unit.
    if re.search(r"\?A(0x\w+)?@", symbol):
        return None

    # X86GenMnemonicTables functions are not exposed from llvm/include/.
    if re.match(r"\?is[A-Z0-9]*@X86@llvm", symbol):
        return None

    # Keep mangled llvm:: and clang:: function symbols. The detection is
    # imprecise, but it avoids having to completely demangle the name. The
    # outermost namespace is at the end of the identifier mangling, and the
    # identifier mangling is followed by the type mangling, so we look for
    # (llvm|clang)@@ followed by something that looks like a function type
    # mangling. To spot a function type we use (this is derived from
    # clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r"(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol):
        return symbol

    # Everything else is discarded.
    return None
129
130
# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. The
# identifier mangling is demangled to spot symbols that can safely be
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Return the name to export for symbol, or None to discard it."""
    # Calling convention decoration is expected on every symbol, even mangled
    # C++ ones, so strip it before looking at anything else.
    name = symbol
    if calling_convention_decoration and name.startswith("_"):
        name = name[1:]

    # Unmangled names are kept as they are.
    if not name.startswith(("_", ".")):
        return name

    # Manglings that aren't nested names are discarded.
    nested = re.match(r"\.?_Z(T[VTIS])?(N.+)", name)
    if nested is None:
        return None

    # Demangle the name. A name that is too complex doesn't need to be kept,
    # but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(nested.group(2))
    except TooComplexName:
        return None
    if not names:
        return name

    # Only llvm:: and clang:: names are kept; everything else is discarded.
    outermost = names[0][0]
    if outermost == "4llvm" or outermost == "5clang":
        return name
    return None
160
161
# Some kinds of complex manglings are assumed not to be part of a public
# interface; they are reported by raising this exception.
class TooComplexName(Exception):
    """Signals a mangled name that is too complex to be worth parsing."""
166
167
# Parse a single itanium mangled name from the start of a string and return a
# (name, rest of string) pair. The name is None when nothing could be parsed.
def parse_itanium_name(arg):
    # A normal name is a decimal length followed by that many characters; the
    # returned name keeps its length prefix.
    length_prefixed = re.match(r"(\d+)(.+)", arg)
    if length_prefixed:
        digits, tail = length_prefixed.groups()
        size = int(digits)
        return digits + tail[:size], tail[size:]

    fallbacks = (
        # Constructor/destructor names
        r"([CD][123])(.+)",
        # Assume that a sequence of characters that doesn't end a nesting is
        # an operator (this is very imprecise, but appears to be good enough)
        r"([^E]+)(.+)",
    )
    for pattern in fallbacks:
        found = re.match(pattern, arg)
        if found:
            return found.group(1), found.group(2)

    # Anything else: we can't handle it
    return None, arg
189
190
# Parse an itanium mangled template argument list from the start of a string
# and throw it away, returning the rest of the string. None is returned when
# the string runs out before the list is closed.
def skip_itanium_template(arg):
    # A template argument list starts with I
    assert arg.startswith("I"), arg
    remaining = arg[1:]
    while remaining:
        # Length-prefixed names are skipped whole
        named = re.match(r"(\d+)(.+)", remaining)
        if named:
            size = int(named.group(1))
            remaining = named.group(2)[size:]
            continue
        # Substitutions are skipped up to and including their underscore
        substitution = re.match(r"S[A-Z0-9]*_(.+)", remaining)
        if substitution:
            remaining = substitution.group(1)
            continue
        lead = remaining[0]
        if lead == "I":
            # Start of a nested template
            remaining = skip_itanium_template(remaining)
        elif lead == "N":
            # Start of a nested name
            _, remaining = parse_itanium_nested_name(remaining)
        elif lead == "L" or lead == "X":
            # Start of an expression: assume that it's too complicated
            raise TooComplexName
        elif lead == "E":
            # End of the template
            return remaining[1:]
        else:
            # Something else: probably a type, skip it
            remaining = remaining[1:]
    return None
224
225
# Parse an itanium mangled nested name and turn it into a list of pairs of
# (name, is_template), returning (list, rest of string). (None, None) is
# returned when the name cannot be demangled.
def parse_itanium_nested_name(arg):
    # A nested name starts with N
    assert arg.startswith("N"), arg
    components = []

    # Skip past the N, and possibly a substitution
    substituted = re.match(r"NS[A-Z0-9]*_(.+)", arg)
    remaining = substituted.group(1) if substituted else arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    qualified = re.match(r"[rVKRO]*(.+)", remaining)
    if qualified:
        remaining = qualified.group(1)

    # Keep parsing names from the string until the end of the nested name is
    # reached
    while remaining:
        # An E ends the nested name
        if remaining.startswith("E"):
            return components, remaining[1:]
        name_part, remaining = parse_itanium_name(remaining)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        # A name followed by I is a template: record that and skip the
        # template arguments
        has_template_args = remaining.startswith("I")
        if has_template_args:
            remaining = skip_itanium_template(remaining)
        components.append((name_part, has_template_args))

    # Running out of input before the closing E means something went wrong
    return None, None
267
268
# Parse a microsoft mangled symbol and return a list of pairs of
# (name, is_template). This is very rudimentary and does just enough
# to determine if the first or second component is a template.
def parse_microsoft_mangling(arg):
    # A name that doesn't start with ? isn't a mangled name
    if not arg.startswith("?"):
        return [(arg, False)]

    # Recognised component shapes, tried in this order. Group 1 is the
    # component name and group 2 is whatever follows the component.
    shapes = (
        (r"(\w+)@(.+)", False),  # simple name
        (r"(\?_?\w)(.+)", False),  # special function name
        (r"\?\$(\w+)@[^@]+@(.+)", True),  # template name
    )

    components = []
    remaining = arg[1:]
    # An empty component (a leading @) marks the end of the name
    while remaining and not remaining.startswith("@"):
        for pattern, is_template in shapes:
            found = re.match(pattern, remaining)
            if found:
                components.append((found.group(1), is_template))
                remaining = found.group(2)
                break
        else:
            # Some other kind of name that we can't handle
            components.append((remaining, False))
            break
    return components
304
305
def extract_symbols(arg):
    """Collect the kept symbols of one library.

    arg is a single tuple (nm path, keep function, calling convention
    decoration flag, library) so that this function can be used with a pool
    map. Returns (defs, refs): defs maps each kept defined symbol to the
    number of times it was seen defined, and refs is the set of kept
    undefined symbols.
    """
    nm_tool, keep_filter, decorated, library = arg
    definitions = {}
    references = set()
    for raw_symbol, is_def in nm_get_symbols(nm_tool, library):
        kept = keep_filter(raw_symbol, decorated)
        if not kept:
            continue
        if is_def:
            definitions[kept] = definitions.get(kept, 0) + 1
        else:
            references.add(kept)
    return (definitions, references)
318
319
def get_template_name(sym, mangling):
    """Return the name of the first template component of sym, or None when
    sym has no template component or cannot be parsed."""
    # Parse the mangling into a list of (name, is_template)
    try:
        if mangling == "microsoft":
            components = parse_microsoft_mangling(sym)
        else:
            nested = re.match(r"\.?_Z(T[VTIS])?(N.+)", sym)
            if nested:
                components = parse_itanium_nested_name(nested.group(2))[0]
            else:
                components = None
    except TooComplexName:
        return None

    if not components:
        return None

    # The first component that is a template wins; None means not a template
    return next(
        (name for name, is_template in components if is_template), None
    )
344
345
def parse_tool_path(parser, tool, val):
    """Check that val can be launched as a program and return it.

    If launching it fails, the failure is reported through parser.error
    using tool as the name shown in the message.
    """
    try:
        probe = subprocess.Popen(
            [val],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.PIPE,
            universal_newlines=True,
        )
        # Close std streams as we don't want any output and we don't want
        # the process to wait for something on stdin.
        for stream in (probe.stdout, probe.stderr, probe.stdin):
            stream.close()
        probe.wait()
    except Exception:
        parser.error(f"Invalid path for {tool}")
    else:
        return val
364
365
if __name__ == "__main__":
    # Command line driver: collect the symbols of every input library in
    # parallel, merge them and print the ones that need to be exported.
    parser = argparse.ArgumentParser(
        description="Extract symbols to export from libraries"
    )
    parser.add_argument(
        "--mangling",
        choices=["itanium", "microsoft"],
        required=True,
        help="expected symbol mangling scheme",
    )
    # Both tools are always invoked below, so ask for them up front instead
    # of failing later with a TypeError when one of them is left out.
    parser.add_argument(
        "--nm",
        metavar="path",
        type=lambda x: parse_tool_path(parser, "nm", x),
        required=True,
        help="path to the llvm-nm executable",
    )
    parser.add_argument(
        "--readobj",
        metavar="path",
        type=lambda x: parse_tool_path(parser, "readobj", x),
        required=True,
        help="path to the llvm-readobj executable",
    )
    parser.add_argument(
        "libs",
        metavar="lib",
        type=str,
        nargs="+",
        help="libraries to extract symbols from",
    )
    parser.add_argument("-o", metavar="file", type=str, help="output to file")
    args = parser.parse_args()

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == "microsoft":
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    suffixes = (".lib", ".a", ".obj", ".o")
    libs = []
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        if not lib.endswith(suffixes):
            # The "lib" prefix belongs on the file name, not on the whole
            # path; for a bare name the two are the same thing.
            directory, base = os.path.split(lib)
            for s in suffixes:
                plain = lib + s
                prefixed = os.path.join(directory, "lib" + base + s)
                if os.path.exists(plain):
                    lib = plain
                    break
                if os.path.exists(prefixed):
                    lib = prefixed
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument " + lib, file=sys.stderr)
            # sys.exit rather than the exit() helper, which only exists when
            # the site module has been loaded.
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = readobj_is_32bit_windows(args.readobj, libs[0])

    is_aix = platform.system() == "AIX"

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    # FIXME: On AIX, the default pool size can be too big for a logical
    #        partition's allocated memory, and can lead to an out of memory
    #        IO error. We are setting the pool size to 8 to avoid such
    #        errors at the moment, and will look for a graceful solution later.
    pool = multiprocessing.Pool(8) if is_aix else multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [
            (args.nm, should_keep_symbol, calling_convention_decoration, x)
            for x in libs
        ]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
        # All the work is done, so reap the worker processes.
        pool.join()
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbol_defs = dict()
    symbol_refs = set()
    for this_lib_defs, this_lib_refs in libs_symbols:
        for k, v in this_lib_defs.items():
            symbol_defs[k] = symbol_defs.get(k, 0) + v
        symbol_refs.update(this_lib_refs)

    # Find which template instantiations are referenced at least once.
    template_instantiation_refs = set()
    for sym in symbol_refs:
        template = get_template_name(sym, args.mangling)
        if template:
            template_instantiation_refs.add(template)

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Are not a template instantiation that isn't referenced anywhere. This
    #    is because we need to export any explicitly instantiated templates,
    #    and we expect those to be referenced in some object.
    outfile = open(args.o, "w") if args.o else sys.stdout
    try:
        for k, v in symbol_defs.items():
            # On AIX, export function descriptors instead of function entries.
            if is_aix and k.startswith("."):
                continue
            template = get_template_name(k, args.mangling)
            if v == 1 and (not template or template in template_instantiation_refs):
                print(k, file=outfile)
    finally:
        # Only close what was opened here; stdout is left alone.
        if outfile is not sys.stdout:
            outfile.close()
492