utils/llvm-locstats/llvm-locstats.py

#!/usr/bin/env python3
#
# This tool is a debug location coverage calculator.
# It parses the llvm-dwarfdump --statistics output and reports it
# in a more human-readable way.
#

from __future__ import print_function
import argparse
import os
import sys
from json import loads
from math import ceil
from collections import OrderedDict
from subprocess import Popen, PIPE

# Special value used to mark statistics that overflowed. It is stored in
# place of the numeric value, so consumers must check for it before doing
# arithmetic on a statistic.
TAINT_VALUE = "tainted"

# Initialize the plot.
def init_plot(plt):
    """Set the title, axis labels and tick styling of the current plot.

    Args:
        plt: the plotting interface to configure; callers in this file pass
            the matplotlib.pyplot module.
    """
    title_style = {"fontweight": "bold"}
    x_tick_style = {"rotation": 45, "fontsize": "x-small"}

    plt.title("Debug Location Statistics", **title_style)
    plt.xlabel("location buckets")
    plt.ylabel("number of variables in the location buckets")
    plt.xticks(**x_tick_style)
    plt.yticks()


# Finalize the plot.
def finish_plot(plt):
    """Add the legend and grid, save the plot and tell the user where it is.

    The image is always written to "locstats.png" in the current working
    directory.

    Args:
        plt: the plotting interface holding the figure; callers in this file
            pass the matplotlib.pyplot module.
    """
    grid_style = {
        "color": "grey",
        "which": "major",
        "axis": "y",
        "linestyle": "-",
        "linewidth": 0.3,
    }

    plt.legend()
    plt.grid(**grid_style)
    plt.savefig("locstats.png")
    print('The plot was saved within "locstats.png".')


# Holds the debug location statistics.
class LocationStats:
    """Debug location statistics collected for one file.

    Any numeric statistic may instead hold TAINT_VALUE, the marker used for
    values that overflowed. Percentages derived from a tainted statistic are
    reported as TAINT_VALUE as well.
    """

    def __init__(
        self,
        file_name,
        variables_total,
        variables_total_locstats,
        variables_with_loc,
        variables_scope_bytes_covered,
        variables_scope_bytes,
        variables_coverage_map,
    ):
        """Store the statistics of one file.

        Args:
            file_name: name of the processed file; used in plot labels.
            variables_total: total number of source variables, or None when
                the total availability should not be reported.
            variables_total_locstats: number of variables processed by the
                location statistics.
            variables_with_loc: number of source variables that have a
                location, or None when the total availability should not be
                reported.
            variables_scope_bytes_covered: number of parent scope bytes that
                are covered by a location.
            variables_scope_bytes: number of parent scope bytes.
            variables_coverage_map: ordered mapping from each coverage bucket
                name (see coverage_buckets()) to the number of variables in
                that bucket.
        """
        self.file_name = file_name
        self.variables_total = variables_total
        self.variables_total_locstats = variables_total_locstats
        self.variables_with_loc = variables_with_loc
        self.scope_bytes_covered = variables_scope_bytes_covered
        self.scope_bytes = variables_scope_bytes
        self.variables_coverage_map = variables_coverage_map

    @staticmethod
    def _percentage(part, whole):
        """Return `part` as an integer percentage of `whole`.

        Returns TAINT_VALUE when either operand is tainted, and 0 when
        `whole` is zero (instead of raising ZeroDivisionError).
        """
        if part == TAINT_VALUE or whole == TAINT_VALUE:
            return TAINT_VALUE
        if whole == 0:
            return 0
        # NOTE(review): ceil() is applied to the numerator only, so for
        # integer inputs the result is effectively truncated. This is kept
        # as-is so that the reported (approximate) percentages do not change.
        return int(ceil(part * 100.0) / whole)

    # Get the PC ranges coverage.
    def get_pc_coverage(self):
        """Return the percentage of scope bytes covered by a location.

        Returns TAINT_VALUE if either byte count overflowed, and 0 when no
        scope bytes were recorded.
        """
        return self._percentage(self.scope_bytes_covered, self.scope_bytes)

    # Pretty print the debug location buckets.
    def pretty_print(self):
        """Print the location buckets table on the standard output.

        Returns:
            -1 if there are no scope bytes (only a notice is printed),
            0 otherwise.
        """
        if self.scope_bytes == 0:
            print("No scope bytes found.")
            return -1

        pc_ranges_covered = self.get_pc_coverage()
        # Compute every percentage before printing anything, so that a
        # missing bucket fails before any partial table is emitted.
        variables_coverage_per_map = {
            cov_bucket: self._percentage(
                self.variables_coverage_map[cov_bucket],
                self.variables_total_locstats,
            )
            for cov_bucket in coverage_buckets()
        }

        print(" =================================================")
        print("            Debug Location Statistics       ")
        print(" =================================================")
        print("     cov%           samples         percentage(~)  ")
        print(" -------------------------------------------------")
        for cov_bucket in coverage_buckets():
            samples = self.variables_coverage_map[cov_bucket]
            percentage = variables_coverage_per_map[cov_bucket]
            if (
                samples == TAINT_VALUE
                or self.variables_total_locstats == TAINT_VALUE
            ):
                # A tainted value is a string, so the integer ("d")
                # presentation type cannot be used for this row.
                row_format = "   {0:10}     {1:8}              {2:3}%"
            else:
                row_format = "   {0:10}     {1:8d}              {2:3d}%"
            print(row_format.format(cov_bucket, samples, percentage))
        print(" =================================================")
        print(
            " -the number of debug variables processed: "
            + str(self.variables_total_locstats)
        )
        print(" -PC ranges covered: " + str(pc_ranges_covered) + "%")

        # Only if we are processing all the variables output the total
        # availability. Note that a count of zero suppresses the line too.
        if self.variables_total and self.variables_with_loc:
            total_availability = self._percentage(
                self.variables_with_loc, self.variables_total
            )
            print(" -------------------------------------------------")
            print(" -total availability: " + str(total_availability) + "%")
        print(" =================================================")

        return 0

    # Draw a plot representing the location buckets.
    def draw_plot(self):
        """Draw a bar chart of the location buckets and save it.

        Requires matplotlib; the image is written by finish_plot().
        """
        from matplotlib import pyplot as plt

        buckets = range(len(self.variables_coverage_map))
        plt.figure(figsize=(12, 8))
        init_plot(plt)
        # Materialize the dict views so plain sequences are handed over.
        plt.bar(
            buckets,
            list(self.variables_coverage_map.values()),
            align="center",
            tick_label=list(self.variables_coverage_map.keys()),
            label="variables of {}".format(self.file_name),
        )

        # Place the text box with the coverage info.
        pc_ranges_covered = self.get_pc_coverage()
        props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)
        plt.text(
            0.02,
            0.90,
            "PC ranges covered: {}%".format(pc_ranges_covered),
            transform=plt.gca().transAxes,
            fontsize=12,
            verticalalignment="top",
            bbox=props,
        )

        finish_plot(plt)

    # Compare the two LocationStats objects and draw a plot showing
    # the difference.
    def draw_location_diff(self, locstats_to_compare):
        """Draw the buckets of two files side by side and save the plot.

        Requires matplotlib; the image is written by finish_plot().

        Args:
            locstats_to_compare: the LocationStats of the second file.
        """
        from matplotlib import pyplot as plt

        pc_ranges_covered = self.get_pc_coverage()
        pc_ranges_covered_to_compare = locstats_to_compare.get_pc_coverage()

        buckets = range(len(self.variables_coverage_map))
        buckets_to_compare = range(len(locstats_to_compare.variables_coverage_map))

        fig = plt.figure(figsize=(12, 8))
        ax = fig.add_subplot(111)
        init_plot(plt)

        comparison_keys = list(coverage_buckets())
        # Both series are aligned to the same tick edge; the positive and
        # negative widths put the bars on opposite sides of that edge.
        ax.bar(
            buckets,
            list(self.variables_coverage_map.values()),
            align="edge",
            width=0.4,
            label="variables of {}".format(self.file_name),
        )
        ax.bar(
            buckets_to_compare,
            list(locstats_to_compare.variables_coverage_map.values()),
            color="r",
            align="edge",
            width=-0.4,
            label="variables of {}".format(locstats_to_compare.file_name),
        )
        ax.set_xticks(range(len(comparison_keys)))
        ax.set_xticklabels(comparison_keys)

        # One text box per file with its PC ranges coverage.
        props = dict(boxstyle="round", facecolor="wheat", alpha=0.5)
        plt.text(
            0.02,
            0.88,
            "{} PC ranges covered: {}%".format(self.file_name, pc_ranges_covered),
            transform=plt.gca().transAxes,
            fontsize=12,
            verticalalignment="top",
            bbox=props,
        )
        plt.text(
            0.02,
            0.83,
            "{} PC ranges covered: {}%".format(
                locstats_to_compare.file_name, pc_ranges_covered_to_compare
            ),
            transform=plt.gca().transAxes,
            fontsize=12,
            verticalalignment="top",
            bbox=props,
        )

        finish_plot(plt)


# Define the location buckets.
def coverage_buckets():
    """Yield the names of the location coverage buckets, in order.

    The sequence is "0%", then ten ranges of ten percentage points each
    (the first one open at zero, the rest closed at their lower bound),
    then "100%".
    """
    yield "0%"
    for lower in range(0, 100, 10):
        # Exactly 0% has its own bucket, so the first range excludes it.
        opening = "(" if lower == 0 else "["
        yield "{0}{1}%,{2}%)".format(opening, lower, lower + 10)
    yield "100%"


# Parse the JSON representing the debug statistics, and create a
# LocationStats object.
def parse_locstats(opts, binary):
    """Run llvm-dwarfdump --statistics on a file and collect its locstats.

    The llvm-dwarfdump executable is looked up in the directory of this
    script. Its JSON output is read for local variables only, formal
    parameters only, or both, depending on the options.

    Args:
        opts: the parsed program options; only_variables,
            only_formal_parameters and ignore_debug_entry_values are read.
        binary: the file to hand over to llvm-dwarfdump.

    Returns:
        A LocationStats object for `binary`. Fields that overflowed are set
        to TAINT_VALUE (a warning is printed for each of them).

    Exits the program with status 1 if llvm-dwarfdump cannot be started, if
    its output is not valid JSON, or if an expected field is missing.
    """
    # The field names differ only in the label of the variable kind: one
    # spelling is used in the "#..." counters and another in the
    # "sum_all_...(...)" byte sums.
    all_variables = not (opts.only_variables or opts.only_formal_parameters)
    if opts.only_variables:
        count_label, sum_label = "local vars", "local_vars"
    elif opts.only_formal_parameters:
        count_label, sum_label = "params", "params"
    else:
        count_label, sum_label = "variables", "variables"

    # Get the directory of the LLVM tools.
    llvm_dwarfdump_cmd = os.path.join(os.path.dirname(__file__), "llvm-dwarfdump")
    # The statistics llvm-dwarfdump option.
    llvm_dwarfdump_stats_opt = "--statistics"

    # Generate the stats with the llvm-dwarfdump.
    try:
        subproc = Popen(
            [llvm_dwarfdump_cmd, llvm_dwarfdump_stats_opt, binary],
            stdin=PIPE,
            stdout=PIPE,
            stderr=PIPE,
            universal_newlines=True,
        )
    except OSError as err:
        # E.g. llvm-dwarfdump is not located next to this script.
        print("error: Unable to run {}: {}".format(llvm_dwarfdump_cmd, err))
        sys.exit(1)
    cmd_stdout, _ = subproc.communicate()

    # TODO: Handle errors that are coming from llvm-dwarfdump.

    # Get the JSON and parse it. Only a decoding failure is treated as
    # "no statistics"; anything else (e.g. KeyboardInterrupt) propagates.
    try:
        json_parsed = loads(cmd_stdout)
    except ValueError:
        print("error: No valid llvm-dwarfdump statistics found.")
        sys.exit(1)

    # TODO: Parse the statistics Version from JSON.

    def init_field(name):
        # Read one statistic, mapping the overflow marker to TAINT_VALUE.
        try:
            value = json_parsed[name]
        except (KeyError, TypeError):
            print(
                'error: "' + name + '" field not found in the '
                "llvm-dwarfdump statistics."
            )
            sys.exit(1)
        if value == "overflowed":
            print('warning: "' + name + '" field overflowed.')
            return TAINT_VALUE
        return value

    # The totals are read only when both local variables and formal
    # parameters are processed.
    variables_total = None
    variables_with_loc = None
    if all_variables:
        variables_total = init_field("#source variables")
        variables_with_loc = init_field("#source variables with location")

    variables_total_locstats = init_field(
        "#{} processed by location statistics".format(count_label)
    )
    variables_scope_bytes_covered = init_field(
        "sum_all_{}(#bytes in parent scope covered by DW_AT_location)".format(
            sum_label
        )
    )
    variables_scope_bytes = init_field(
        "sum_all_{}(#bytes in parent scope)".format(sum_label)
    )

    if opts.ignore_debug_entry_values:
        variables_scope_bytes_entry_values = init_field(
            "sum_all_{}(#bytes in parent scope covered by "
            "DW_OP_entry_value)".format(sum_label)
        )
        # Do not count the bytes covered by entry values, unless one of the
        # operands overflowed (the subtraction would be meaningless).
        if (
            variables_scope_bytes_covered != TAINT_VALUE
            and variables_scope_bytes_entry_values != TAINT_VALUE
        ):
            variables_scope_bytes_covered = (
                variables_scope_bytes_covered - variables_scope_bytes_entry_values
            )
        cov_category_format = (
            "#{} - entry values with {} of parent scope covered by DW_AT_location"
        )
    else:
        cov_category_format = "#{} with {} of parent scope covered by DW_AT_location"

    variables_coverage_map = OrderedDict()
    for cov_bucket in coverage_buckets():
        variables_coverage_map[cov_bucket] = init_field(
            cov_category_format.format(count_label, cov_bucket)
        )

    return LocationStats(
        binary,
        variables_total,
        variables_total_locstats,
        variables_with_loc,
        variables_scope_bytes_covered,
        variables_scope_bytes,
        variables_coverage_map,
    )


# Parse the program arguments.
def parse_program_args(parser):
    """Register the program options on `parser` and parse the command line.

    Args:
        parser: the argparse.ArgumentParser to populate.

    Returns:
        The namespace produced by parser.parse_args(): one boolean per
        option plus `file_names`, the list of files to process.
    """
    # Every option is a plain on/off switch that defaults to off.
    switches = (
        (
            "--only-variables",
            "calculate the location statistics only for local variables",
        ),
        (
            "--only-formal-parameters",
            "calculate the location statistics only for formal parameters",
        ),
        (
            "--ignore-debug-entry-values",
            "ignore the location statistics on locations with " "entry values",
        ),
        (
            "--draw-plot",
            "show histogram of location buckets generated (requires " "matplotlib)",
        ),
        (
            "--compare",
            "compare the debug location coverage on two files provided, "
            "and draw a plot showing the difference  (requires "
            "matplotlib)",
        ),
    )
    for option, description in switches:
        parser.add_argument(
            option, action="store_true", default=False, help=description
        )

    parser.add_argument("file_names", nargs="+", type=str, help="file to process")

    return parser.parse_args()


# Verify that the program inputs meet the requirements.
def verify_program_inputs(opts):
    """Check that the parsed options form a valid invocation.

    An error message is printed on the standard output for the first
    violated requirement.

    Args:
        opts: the parsed program options.

    Returns:
        True if the inputs are usable, False otherwise.
    """

    def reject(message):
        # Report the problem and produce the failure result in one step.
        print(message)
        return False

    if len(sys.argv) < 2:
        return reject("error: Too few arguments.")

    if opts.only_variables and opts.only_formal_parameters:
        return reject("error: Please use just one --only* option.")

    # A comparison needs exactly two files, any other mode exactly one.
    file_count = len(opts.file_names)
    if opts.compare:
        if file_count != 2:
            return reject("error: Please specify two files to process.")
    elif file_count != 1:
        return reject("error: Please specify only one file to process.")

    # Both plotting modes depend on matplotlib being importable.
    if opts.draw_plot or opts.compare:
        try:
            import matplotlib
        except ImportError:
            return reject("error: matplotlib not found.")

    return True


def Main():
    """Parse the command line and print, plot or compare the statistics.

    Exits with status 1 (after printing the help) if the inputs are invalid,
    and with status 0 if the pretty printer reports that there is nothing to
    print.
    """
    parser = argparse.ArgumentParser()
    opts = parse_program_args(parser)

    if not verify_program_inputs(opts):
        parser.print_help()
        sys.exit(1)

    # The first file is needed in every mode.
    locstats = parse_locstats(opts, opts.file_names[0])

    if opts.compare:
        # Draw a plot showing the difference in debug location coverage
        # between two files.
        locstats_to_compare = parse_locstats(opts, opts.file_names[1])
        locstats.draw_location_diff(locstats_to_compare)
        return

    if opts.draw_plot:
        # Draw a histogram representing the location buckets.
        locstats.draw_plot()
        return

    # Pretty print collected info on the standard output.
    if locstats.pretty_print() == -1:
        sys.exit(0)


# Run the tool only when this file is executed as a script (not when it is
# imported), then exit with status 0.
if __name__ == "__main__":
    Main()
    sys.exit(0)