# ===- perf-helper.py - Perf Training Helpers -----------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===------------------------------------------------------------------------===#

from __future__ import absolute_import, division, print_function

import sys
import os
import subprocess
import argparse
import time
import bisect
import shlex
import tempfile
import re
import shutil
import random

test_env = {"PATH": os.environ["PATH"]}


def findFilesWithExtension(path, extension):
    filenames = []
    for root, dirs, files in os.walk(path):
        for filename in files:
            if filename.endswith(f".{extension}"):
                filenames.append(os.path.join(root, filename))
    return filenames


def clean(args):
    if len(args) < 2:
        print(
            "Usage: %s clean <paths> <extension>\n" % __file__
            + "\tRemoves all files with extension from <path>."
        )
        return 1
    for path in args[0:-1]:
        for filename in findFilesWithExtension(path, args[-1]):
            os.remove(filename)
    return 0


def merge(args):
    if len(args) < 3:
        print(
            "Usage: %s merge <llvm-profdata> <output> <paths>\n" % __file__
            + "\tMerges all profraw files from path into output."
        )
        return 1
    cmd = [args[0], "merge", "-o", args[1]]
    for path in args[2:]:
        cmd.extend(findFilesWithExtension(path, "profraw"))
    subprocess.check_call(cmd)
    return 0


def merge_fdata(args):
    if len(args) != 3:
        print(
            "Usage: %s merge-fdata <merge-fdata> <output> <path>\n" % __file__
            + "\tMerges all fdata files from path into output."
        )
        return 1
    cmd = [args[0], "-o", args[1]]
    cmd.extend(findFilesWithExtension(args[2], "fdata"))
    subprocess.check_call(cmd)
    return 0


def perf(args):
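    # Wraps `perf record` around the profiled command and writes the profile to
    # <pid>.perf.data in the current directory, where perf2bolt() later picks it
    # up by extension. Note that opts.cmd[1:] below drops the leading element of
    # the remainder (typically a "--" separator).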
    parser = argparse.ArgumentParser(
        prog="perf-helper perf", description="perf wrapper for BOLT profile collection"
    )
    parser.add_argument(
        "--lbr", action="store_true", help="Use perf with branch stacks"
    )
    parser.add_argument("cmd", nargs=argparse.REMAINDER, help="")

    opts = parser.parse_args(args)
    cmd = opts.cmd[1:]

    perf_args = [
        "perf",
        "record",
        "--event=cycles:u",
        "--freq=max",
        "--output=%d.perf.data" % os.getpid(),
    ]
    if opts.lbr:
        perf_args += ["--branch-filter=any,u"]
    perf_args.extend(cmd)

    start_time = time.time()
    subprocess.check_call(perf_args)

    elapsed = time.time() - start_time
    print("... data collection took %.4fs" % elapsed)
    return 0


def perf2bolt(args):
    parser = argparse.ArgumentParser(
        prog="perf-helper perf2bolt",
        description="perf2bolt conversion wrapper for perf.data files",
    )
    parser.add_argument("bolt", help="Path to llvm-bolt")
    parser.add_argument("path", help="Path containing perf.data files")
    parser.add_argument("binary", help="Input binary")
    parser.add_argument("--lbr", action="store_true", help="Use LBR perf2bolt mode")
    opts = parser.parse_args(args)

    p2b_args = [
        opts.bolt,
        opts.binary,
        "--aggregate-only",
        "--profile-format=yaml",
    ]
    if not opts.lbr:
        p2b_args += ["-nl"]
    p2b_args += ["-p"]
    for filename in findFilesWithExtension(opts.path, "perf.data"):
        subprocess.check_call(p2b_args + [filename, "-o", filename + ".fdata"])
    return 0


def dtrace(args):
    parser = argparse.ArgumentParser(
        prog="perf-helper dtrace",
        description="dtrace wrapper for order file generation",
    )
    parser.add_argument(
        "--buffer-size",
        metavar="size",
        type=int,
        required=False,
        default=1,
        help="dtrace buffer size in MB (default 1)",
    )
    parser.add_argument(
        "--use-oneshot",
        required=False,
        action="store_true",
        help="Use dtrace's oneshot probes",
    )
    parser.add_argument(
        "--use-ustack",
        required=False,
        action="store_true",
        help="Use dtrace's ustack to print function names",
    )
    parser.add_argument(
        "--cc1",
        required=False,
        action="store_true",
        help="Execute cc1 directly (don't profile the driver)",
    )
    parser.add_argument("cmd", nargs="*", help="")

    # Use python's arg parser to handle all leading option arguments, but pass
    # everything else through to dtrace
    first_cmd = next(arg for arg in args if not arg.startswith("--"))
    last_arg_idx = args.index(first_cmd)

    opts = parser.parse_args(args[:last_arg_idx])
    cmd = args[last_arg_idx:]

    if opts.cc1:
        cmd = get_cc1_command_for_args(cmd, test_env)

    if opts.use_oneshot:
        target = "oneshot$target:::entry"
    else:
        target = "pid$target:::entry"
    predicate = '%s/probemod=="%s"/' % (target, os.path.basename(cmd[0]))
    log_timestamp = 'printf("dtrace-TS: %d\\n", timestamp)'
    if opts.use_ustack:
        action = "ustack(1);"
    else:
        action = 'printf("dtrace-Symbol: %s\\n", probefunc);'
    dtrace_script = "%s { %s; %s }" % (predicate, log_timestamp, action)
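    # With the defaults and, say, a "clang" probe module, the generated script
    # looks like this (illustrative, shown on two lines):
    #   pid$target:::entry/probemod=="clang"/
    #     { printf("dtrace-TS: %d\n", timestamp); printf("dtrace-Symbol: %s\n", probefunc); }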

    dtrace_args = []
    if os.geteuid() != 0:
        print(
            "Script must be run as root, or you must add the following to your sudoers:\n"
            + "%admin ALL=(ALL) NOPASSWD: /usr/sbin/dtrace"
        )
        dtrace_args.append("sudo")

    dtrace_args.extend(
        (
            "dtrace",
            "-xevaltime=exec",
            "-xbufsize=%dm" % (opts.buffer_size),
            "-q",
            "-n",
            dtrace_script,
            "-c",
            " ".join(cmd),
        )
    )

    if sys.platform == "darwin":
        dtrace_args.append("-xmangled")

    start_time = time.time()

    with open("%d.dtrace" % os.getpid(), "w") as f:
        f.write("### Command: %s\n" % dtrace_args)
        # Flush so the header lands in the file before dtrace's output.
        f.flush()
        subprocess.check_call(dtrace_args, stdout=f, stderr=subprocess.PIPE)

    elapsed = time.time() - start_time
    print("... data collection took %.4fs" % elapsed)

    return 0


def get_cc1_command_for_args(cmd, env):
    # Find the cc1 command used by the compiler. To do this we execute the
    # compiler with '-###' to figure out what it wants to do.
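    # The '-###' output typically contains a version banner plus lines such as
    # "Target:", "Thread model:" and "InstalledDir:", followed by one quoted job
    # line, e.g. (illustrative):
    #   "/usr/bin/clang" "-cc1" "-triple" ... "-o" "foo.o" "-x" "c" "foo.c"
    # The filtering below keeps only that job line.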
    cmd = cmd + ["-###"]
    cc_output = subprocess.check_output(
        cmd, stderr=subprocess.STDOUT, env=env, universal_newlines=True
    ).strip()
    cc_commands = []
    for ln in cc_output.split("\n"):
        # Filter out known garbage.
        if (
            ln == "Using built-in specs."
            or ln.startswith("Configured with:")
            or ln.startswith("Target:")
            or ln.startswith("Thread model:")
            or ln.startswith("InstalledDir:")
            or ln.startswith("LLVM Profile Note")
            or ln.startswith(" (in-process)")
            or " version " in ln
        ):
            continue
        cc_commands.append(ln)

    if len(cc_commands) != 1:
        print("Fatal error: unable to determine cc1 command: %r" % cc_output)
        sys.exit(1)

    cc1_cmd = shlex.split(cc_commands[0])
    if not cc1_cmd:
        print("Fatal error: unable to determine cc1 command: %r" % cc_output)
        sys.exit(1)

    return cc1_cmd


def cc1(args):
    parser = argparse.ArgumentParser(
        prog="perf-helper cc1", description="cc1 wrapper for order file generation"
    )
    parser.add_argument("cmd", nargs="*", help="")

    # Use Python's arg parser to handle all leading option arguments, but pass
    # everything else through as the compiler command.
    first_cmd = next(arg for arg in args if not arg.startswith("--"))
    last_arg_idx = args.index(first_cmd)

    opts = parser.parse_args(args[:last_arg_idx])
    cmd = args[last_arg_idx:]

    # Clear the profile file environment variable so that we don't generate
    # profdata while capturing the cc1 command.
    cc1_env = dict(test_env)
    cc1_env["LLVM_PROFILE_FILE"] = os.devnull
    cc1_cmd = get_cc1_command_for_args(cmd, cc1_env)

    subprocess.check_call(cc1_cmd)
    return 0


def parse_dtrace_symbol_file(path, all_symbols, all_symbols_set, missing_symbols, opts):
    def fix_mangling(symbol):
        if sys.platform == "darwin":
            if symbol[0] != "_" and symbol != "start":
                symbol = "_" + symbol
        return symbol

    def get_symbols_with_prefix(symbol):
        start_index = bisect.bisect_left(all_symbols, symbol)
        for s in all_symbols[start_index:]:
            if not s.startswith(symbol):
                break
            yield s

    # Extract the list of symbols from the given file, which is assumed to be
    # the output of a dtrace run logging either probefunc or ustack(1) and
    # nothing else. The dtrace -xdemangle option needs to be used.
    #
    # This is particular to OS X at the moment, because of the '_' handling.
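    #
    # Typical input lines, matching the printf formats built in dtrace() above
    # (values illustrative):
    #   dtrace-TS: 123456789
    #   dtrace-Symbol: main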
    with open(path) as f:
        current_timestamp = None
        for ln in f:
            # Drop leading and trailing whitespace.
            ln = ln.strip()
            if not ln.startswith("dtrace-"):
                continue

            # If this is a timestamp specifier, extract it.
            if ln.startswith("dtrace-TS: "):
                _, data = ln.split(": ", 1)
                if not data.isdigit():
                    print(
                        "warning: unrecognized timestamp line %r, ignoring" % ln,
                        file=sys.stderr,
                    )
                    continue
                current_timestamp = int(data)
                continue
            elif ln.startswith("dtrace-Symbol: "):
                _, ln = ln.split(": ", 1)
                if not ln:
                    continue

                # If there is a '`' in the line, assume it is a ustack(1) entry in
                # the form of <modulename>`<modulefunc>, where <modulefunc> is never
                # truncated (but does need the mangling patched).
                if "`" in ln:
                    yield (current_timestamp, fix_mangling(ln.split("`", 1)[1]))
                    continue

                # Otherwise, assume this is a probefunc printout. DTrace on OS X
                # seems to have a bug where it prints the mangled version of symbols
                # which aren't C++ mangled. We just add a '_' to anything but start
                # which doesn't already have a '_'.
                symbol = fix_mangling(ln)

                # If we don't know all the symbols, or the symbol is one of them,
                # just return it.
                if not all_symbols_set or symbol in all_symbols_set:
                    yield (current_timestamp, symbol)
                    continue

                # Otherwise, we have a symbol name which isn't present in the
                # binary. We assume it is truncated, and try to extend it.

                # Get all the symbols with this prefix.
                possible_symbols = list(get_symbols_with_prefix(symbol))
                if not possible_symbols:
                    continue

                # If we found too many possible symbols, ignore this as a prefix.
                if len(possible_symbols) > 100:
                    print(
                        "warning: ignoring symbol %r " % symbol
                        + "(no match and too many possible suffixes)",
                        file=sys.stderr,
                    )
                    continue

                # Report that we resolved a missing symbol.
                if opts.show_missing_symbols and symbol not in missing_symbols:
                    print(
                        "warning: resolved missing symbol %r" % symbol, file=sys.stderr
                    )
                    missing_symbols.add(symbol)

                # Otherwise, treat all the possible matches as having occurred. This
                # is an over-approximation, but it should be ok in practice.
                for s in possible_symbols:
                    yield (current_timestamp, s)


def uniq(items):
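    # Yield the items in first-seen order, dropping duplicates. For example,
    # list(uniq([3, 1, 3, 2, 1])) evaluates to [3, 1, 2].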
    seen = set()
    for item in items:
        if item not in seen:
            yield item
            seen.add(item)


def form_by_call_order(symbol_lists):
    # Simple strategy: just return symbols in order of occurrence, even across
    # multiple runs.
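    # For example, [["a", "b"], ["b", "c"]] produces ["a", "b", "c"].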
    return uniq(s for symbols in symbol_lists for s in symbols)


def form_by_call_order_fair(symbol_lists):
    # More complicated strategy that tries to respect the call order across all
    # of the test cases, instead of giving a huge preference to the first test
    # case.

    # First, uniq all the lists.
    uniq_lists = [list(uniq(symbols)) for symbols in symbol_lists]

    # Compute the successors for each list.
    succs = {}
    for symbols in uniq_lists:
        for a, b in zip(symbols[:-1], symbols[1:]):
            succs[a] = items = succs.get(a, [])
            if b not in items:
                items.append(b)

    # Emit all the symbols, but make sure to always emit all successors from any
    # call list whenever we see a symbol.
    #
    # There isn't much science here, but this sometimes works better than the
    # more naive strategy. Then again, sometimes it doesn't, so more research is
    # probably needed.
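    # Hand-worked example: for [["a", "b"], ["c", "b", "d"]] the successor map
    # is {"a": ["b"], "c": ["b"], "b": ["d"]}, and the emitted order is
    # ["a", "b", "d", "c"]; "d" is pulled forward because it follows "b" in the
    # second list, whereas plain call order would give ["a", "b", "c", "d"].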
    return uniq(
        s
        for symbols in symbol_lists
        for node in symbols
        for s in ([node] + succs.get(node, []))
    )


def form_by_frequency(symbol_lists):
    # Form the order file by just putting the most commonly occurring symbols
    # first. This assumes the data files didn't use the oneshot dtrace method.
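    # For example, [["a", "b", "a"], ["a"]] gives counts {"a": 3, "b": 1} and
    # therefore the order ["a", "b"].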

    counts = {}
    for symbols in symbol_lists:
        for a in symbols:
            counts[a] = counts.get(a, 0) + 1

    by_count = list(counts.items())
    by_count.sort(key=lambda kv: -kv[1])
    return [s for s, n in by_count]


def form_by_random(symbol_lists):
    # Randomize the symbols. random.shuffle needs a list, so materialize the
    # uniq() generator first.
    merged_symbols = list(uniq(s for symbols in symbol_lists for s in symbols))
    random.shuffle(merged_symbols)
    return merged_symbols


def form_by_alphabetical(symbol_lists):
    # Alphabetize the symbols.
    merged_symbols = list(set(s for symbols in symbol_lists for s in symbols))
    merged_symbols.sort()
    return merged_symbols


methods = dict(
    (name[len("form_by_") :], value)
    for name, value in locals().items()
    if name.startswith("form_by_")
)
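# The resulting keys are the names accepted by --method below: "call_order",
# "call_order_fair", "frequency", "random", and "alphabetical".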


def genOrderFile(args):
    parser = argparse.ArgumentParser(
        usage="%(prog)s [options] <dtrace data file directories>"
    )
    parser.add_argument("input", nargs="+", help="")
    parser.add_argument(
        "--binary",
        metavar="PATH",
        type=str,
        dest="binary_path",
        help="Path to the binary being ordered (for getting all symbols)",
        default=None,
    )
    parser.add_argument(
        "--output",
        dest="output_path",
        help="path to output order file to write",
        default=None,
        required=True,
        metavar="PATH",
    )
    parser.add_argument(
        "--show-missing-symbols",
        dest="show_missing_symbols",
        help="show symbols which are 'fixed up' to a valid name (requires --binary)",
        action="store_true",
        default=None,
    )
    parser.add_argument(
        "--output-unordered-symbols",
        dest="output_unordered_symbols_path",
        help="write a list of the unordered symbols to PATH (requires --binary)",
        default=None,
        metavar="PATH",
    )
    parser.add_argument(
        "--method",
        dest="method",
        help="order file generation method to use",
        choices=list(methods.keys()),
        default="call_order",
    )
    opts = parser.parse_args(args)

    # If the user gave us a binary, get all the symbols in the binary by
    # snarfing 'nm' output.
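    # 'nm -P' prints one symbol per line in the POSIX "name type value size"
    # format, e.g. (illustrative):
    #   _main T 100003f60 24
    # so the first space-separated field below is the symbol name.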
    if opts.binary_path is not None:
        output = subprocess.check_output(
            ["nm", "-P", opts.binary_path], universal_newlines=True
        )
        lines = output.split("\n")
        all_symbols = [ln.split(" ", 1)[0] for ln in lines if ln.strip()]
        print("found %d symbols in binary" % len(all_symbols))
        all_symbols.sort()
    else:
        all_symbols = []
    all_symbols_set = set(all_symbols)

    # Compute the list of input files.
    input_files = []
    for dirname in opts.input:
        input_files.extend(findFilesWithExtension(dirname, "dtrace"))

    # Load all of the input files.
    print("loading from %d data files" % len(input_files))
    missing_symbols = set()
    timestamped_symbol_lists = [
        list(
            parse_dtrace_symbol_file(
                path, all_symbols, all_symbols_set, missing_symbols, opts
            )
        )
        for path in input_files
    ]

    # Reorder each symbol list.
    symbol_lists = []
    for timestamped_symbols_list in timestamped_symbol_lists:
        timestamped_symbols_list.sort()
        symbol_lists.append([symbol for _, symbol in timestamped_symbols_list])

    # Execute the desired order file generation method.
    method = methods.get(opts.method)
    result = list(method(symbol_lists))

    # Report to the user on what percentage of symbols are present in the order
    # file.
    num_ordered_symbols = len(result)
    if all_symbols:
        print(
            "note: order file contains %d/%d symbols (%.2f%%)"
            % (
                num_ordered_symbols,
                len(all_symbols),
                100.0 * num_ordered_symbols / len(all_symbols),
            ),
            file=sys.stderr,
        )

    if opts.output_unordered_symbols_path:
        ordered_symbols_set = set(result)
        with open(opts.output_unordered_symbols_path, "w") as f:
            f.write("\n".join(s for s in all_symbols if s not in ordered_symbols_set))

    # Write the order file.
    with open(opts.output_path, "w") as f:
        f.write("\n".join(result))
        f.write("\n")

    return 0


def bolt_optimize(args):
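    # Drives the full BOLT profile collection and optimization cycle:
    #   1. skip binaries that already contain a .bolt.org.text section,
    #   2. for INSTRUMENT, emit an instrumented copy of the input binary,
    #   3. run the bolt-fdata lit suite to generate training profiles,
    #   4. convert (perf2bolt) and/or merge the profiles into opts.fdata,
    #   5. rewrite the original binary in place with llvm-bolt.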
    parser = argparse.ArgumentParser(usage="%(prog)s [options]")
    parser.add_argument("--method", choices=["INSTRUMENT", "PERF", "LBR"])
    parser.add_argument("--input")
    parser.add_argument("--instrumented-output")
    parser.add_argument("--fdata")
    parser.add_argument("--perf-training-binary-dir")
    parser.add_argument("--readelf")
    parser.add_argument("--bolt")
    parser.add_argument("--lit")
    parser.add_argument("--merge-fdata")

    opts = parser.parse_args(args)

    output = subprocess.check_output(
        [opts.readelf, "-WS", opts.input], universal_newlines=True
    )

    # This binary has already been bolt-optimized, so skip further processing.
    if re.search(r"\.bolt\.org\.text", output, re.MULTILINE):
        return 0

    if opts.method == "INSTRUMENT":
        process = subprocess.run(
            [
                opts.bolt,
                opts.input,
                "-o",
                opts.instrumented_output,
                "-instrument",
                "--instrumentation-file-append-pid",
                f"--instrumentation-file={opts.fdata}",
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )

        print(process.args)
        # process.stdout is a single string here (text=True), so write it as one.
        sys.stdout.write(process.stdout)
        process.check_returncode()

    # Run the bolt-fdata lit tests to generate profile data.
    process = subprocess.run(
        [
            sys.executable,
            opts.lit,
            os.path.join(opts.perf_training_binary_dir, "bolt-fdata"),
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )

    print(process.args)
    sys.stdout.write(process.stdout)
    process.check_returncode()

    if opts.method in ["PERF", "LBR"]:
        perf2bolt([opts.bolt, opts.perf_training_binary_dir, opts.input])

    merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])

    shutil.copy(opts.input, f"{opts.input}-prebolt")

    bolt_args = [
        opts.bolt,
        f"{opts.input}-prebolt",
        "-o",
        opts.input,
        "-data",
        opts.fdata,
        "-reorder-blocks=ext-tsp",
        "-reorder-functions=cdsort",
        "-split-functions",
        "-split-all-cold",
        "-split-eh",
        "-dyno-stats",
        "-use-gnu-stack",
        "-update-debug-sections",
    ]
    if opts.method == "PERF":
        # Non-LBR perf profiles need BOLT's -nl mode.
        bolt_args.append("-nl")

    process = subprocess.run(
        bolt_args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )

    print(process.args)
    sys.stdout.write(process.stdout)
    process.check_returncode()
    return 0


commands = {
    "bolt-optimize": bolt_optimize,
    "clean": clean,
    "merge": merge,
    "dtrace": dtrace,
    "cc1": cc1,
    "gen-order-file": genOrderFile,
    "merge-fdata": merge_fdata,
    "perf": perf,
    "perf2bolt": perf2bolt,
}
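# Illustrative invocations (normally the perf-training build rules drive this
# script):
#   python perf-helper.py clean <build-dir> profraw
#   python perf-helper.py merge <llvm-profdata> clang.profdata <profile-dir>
#   python perf-helper.py gen-order-file --output clang.order <dtrace-data-dir>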


def main():
    f = commands[sys.argv[1]]
    sys.exit(f(sys.argv[2:]))


if __name__ == "__main__":
    main()