#!/usr/bin/env python

from __future__ import print_function

import argparse
import email.mime.multipart
import email.mime.text
import logging
import os.path
import pickle
import re
import smtplib
import subprocess
import sys
from datetime import datetime, timedelta
from phabricator import Phabricator

# Setting up a virtualenv to run this script can be done by running the
# following commands:
# $ virtualenv venv
# $ . ./venv/bin/activate
# $ pip install Phabricator

GIT_REPO_METADATA = (("llvm-monorepo", "https://github.com/llvm/llvm-project"),)

# The below PhabXXX classes represent objects as modelled by Phabricator.
# The classes can be serialized to disk, to try and make sure that we don't
# needlessly have to re-fetch lots of data from Phabricator, as that would
# make this script unusably slow.


class PhabObject:
    OBJECT_KIND = None

    def __init__(self, id):
        self.id = id


class PhabObjectCache:
    def __init__(self, PhabObjectClass):
        self.PhabObjectClass = PhabObjectClass
        self.most_recent_info = None
        self.oldest_info = None
        self.id2PhabObjects = {}

    def get_name(self):
        return self.PhabObjectClass.OBJECT_KIND + "sCache"

    def get(self, id):
        if id not in self.id2PhabObjects:
            self.id2PhabObjects[id] = self.PhabObjectClass(id)
        return self.id2PhabObjects[id]

    def get_ids_in_cache(self):
        return list(self.id2PhabObjects.keys())

    def get_objects(self):
        return list(self.id2PhabObjects.values())

    DEFAULT_DIRECTORY = "PhabObjectCache"

    def _get_pickle_name(self, directory):
        file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
        return os.path.join(directory, file_name)

    def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
        """
        FIXME: consider if serializing to JSON would bring interoperability
        advantages over serializing to pickle.
        """
        try:
            f = open(self._get_pickle_name(directory), "rb")
        except IOError as err:
            print("Could not find cache. Error message: {0}. Continuing...".format(err))
        else:
            with f:
                try:
                    d = pickle.load(f)
                    self.__dict__.update(d)
                except EOFError as err:
                    print(
                        "Cache seems to be corrupt. "
                        + "Not using cache. Error message: {0}".format(err)
                    )

    def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(self._get_pickle_name(directory), "wb") as f:
            pickle.dump(self.__dict__, f)
        print(
            "wrote cache to disk, most_recent_info = {0}".format(
                datetime.fromtimestamp(self.most_recent_info)
                if self.most_recent_info is not None
                else None
            )
        )

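# Illustrative round trip (hypothetical ID, not from the source):
#   cache = PhabObjectCache(PhabReview)
#   cache.get(12345)                  # creates and caches PhabReview(12345)
#   cache.write_cache_to_disk()       # -> PhabObjectCache/PhabReviews.pickle
#   cache.populate_cache_from_disk()  # restores id2PhabObjects on a later run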

class PhabReview(PhabObject):
    OBJECT_KIND = "Review"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, title, dateCreated, dateModified, author):
        self.title = title
        self.dateCreated = dateCreated
        self.dateModified = dateModified
        self.author = author

    def setPhabDiffs(self, phabDiffs):
        self.phabDiffs = phabDiffs


class PhabUser(PhabObject):
    OBJECT_KIND = "User"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, phid, realName):
        self.phid = phid
        self.realName = realName


class PhabHunk:
    def __init__(self, rest_api_hunk):
        self.oldOffset = int(rest_api_hunk["oldOffset"])
        self.oldLength = int(rest_api_hunk["oldLength"])
        # self.actual_lines_changed_offset will contain the offsets of the
        # lines that were changed in this hunk.
        self.actual_lines_changed_offset = []
        offset = self.oldOffset
        inHunk = False
        hunkStart = -1
        contextLines = 3
        for line in rest_api_hunk["corpus"].split("\n"):
            if line.startswith("+"):
                # line is a new line that got introduced in this patch.
                # Do not record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                continue
            if line.startswith("-"):
                # line was changed or removed from the older version of the
                # code. Record it as a changed line.
                if inHunk is False:
                    inHunk = True
                    hunkStart = max(self.oldOffset, offset - contextLines)
                offset += 1
                continue
            # line is a context line.
            if inHunk is True:
                inHunk = False
                hunkEnd = offset + contextLines
                self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
            offset += 1
        if inHunk is True:
            hunkEnd = offset + contextLines
            self.actual_lines_changed_offset.append((hunkStart, hunkEnd))

        # The above algorithm could result in adjacent or overlapping ranges
        # being recorded into self.actual_lines_changed_offset.
        # Merge the adjacent and overlapping ranges in there:
        t = []
        lastRange = None
        for start, end in self.actual_lines_changed_offset + [
            (sys.maxsize, sys.maxsize)
        ]:
            if lastRange is None:
                lastRange = (start, end)
            else:
                if lastRange[1] >= start:
                    lastRange = (lastRange[0], end)
                else:
                    t.append(lastRange)
                    lastRange = (start, end)
        self.actual_lines_changed_offset = t
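        # Worked example (hypothetical values, for illustration): with
        # contextLines == 3, recorded ranges [(10, 18), (16, 25), (40, 44)]
        # merge to [(10, 25), (40, 44)], since 18 >= 16 means the first two
        # ranges overlap once context is included.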


class PhabChange:
    def __init__(self, rest_api_change):
        self.oldPath = rest_api_change["oldPath"]
        self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]


class PhabDiff(PhabObject):
    OBJECT_KIND = "Diff"

    def __init__(self, id):
        PhabObject.__init__(self, id)

    def update(self, rest_api_results):
        self.revisionID = rest_api_results["revisionID"]
        self.dateModified = int(rest_api_results["dateModified"])
        self.dateCreated = int(rest_api_results["dateCreated"])
        self.changes = [PhabChange(c) for c in rest_api_results["changes"]]


class ReviewsCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabReview)


class UsersCache(PhabObjectCache):
    def __init__(self):
        PhabObjectCache.__init__(self, PhabUser)


reviews_cache = ReviewsCache()
users_cache = UsersCache()


def init_phab_connection():
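    # Note (assumption, not stated in the source): the phabricator package
    # reads its connection settings (host URI, API token) from ~/.arcrc by
    # default, so that file needs to be configured before running this script.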
    phab = Phabricator()
    phab.update_interfaces()
    return phab


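# Fetch pages of results, most recent first, and keep paging until either the
# cursor is exhausted or the cache already covers both ends of the requested
# time window (max_nr_days_to_cache back from the newest record).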
def update_cached_info(
    phab,
    cache,
    phab_query,
    order,
    record_results,
    max_nr_entries_per_fetch,
    max_nr_days_to_cache,
):
    q = phab
    LIMIT = max_nr_entries_per_fetch
    for query_step in phab_query:
        q = getattr(q, query_step)
    results = q(order=order, limit=LIMIT)
    most_recent_info, oldest_info = record_results(cache, results, phab)
    oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - timedelta(
        days=max_nr_days_to_cache
    )
    most_recent_info_overall = most_recent_info
    cache.write_cache_to_disk()
    after = results["cursor"]["after"]
    print("after: {0!r}".format(after))
    print("most_recent_info: {0}".format(datetime.fromtimestamp(most_recent_info)))
    while (
        after is not None and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch
    ):
        need_more_older_data = (
            cache.oldest_info is None
            or datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch
        )
        print(
            (
                "need_more_older_data={0} cache.oldest_info={1} "
                + "oldest_info_to_fetch={2}"
            ).format(
                need_more_older_data,
                datetime.fromtimestamp(cache.oldest_info)
                if cache.oldest_info is not None
                else None,
                oldest_info_to_fetch,
            )
        )
        need_more_newer_data = (
            cache.most_recent_info is None or cache.most_recent_info < most_recent_info
        )
        print(
            (
                "need_more_newer_data={0} cache.most_recent_info={1} "
                + "most_recent_info={2}"
            ).format(need_more_newer_data, cache.most_recent_info, most_recent_info)
        )
        if not need_more_older_data and not need_more_newer_data:
            break
        results = q(order=order, after=after, limit=LIMIT)
        most_recent_info, oldest_info = record_results(cache, results, phab)
        after = results["cursor"]["after"]
        print("after: {0!r}".format(after))
        print("most_recent_info: {0}".format(datetime.fromtimestamp(most_recent_info)))
        cache.write_cache_to_disk()
    cache.most_recent_info = most_recent_info_overall
    if after is None:
        # We did fetch all records. Mark the cache to contain all info since
        # the start of time.
        oldest_info = 0
    cache.oldest_info = oldest_info
    cache.write_cache_to_disk()


def record_reviews(cache, reviews, phab):
    most_recent_info = None
    oldest_info = None
    for reviewInfo in reviews["data"]:
        if reviewInfo["type"] != "DREV":
            continue
        id = reviewInfo["id"]
        # phid = reviewInfo["phid"]
        dateModified = int(reviewInfo["fields"]["dateModified"])
        dateCreated = int(reviewInfo["fields"]["dateCreated"])
        title = reviewInfo["fields"]["title"]
        author = reviewInfo["fields"]["authorPHID"]
        phabReview = cache.get(id)
        if (
            "dateModified" not in phabReview.__dict__
            or dateModified > phabReview.dateModified
        ):
            diff_results = phab.differential.querydiffs(revisionIDs=[id])
            diff_ids = sorted(diff_results.keys())
            phabDiffs = []
            for diff_id in diff_ids:
                diffInfo = diff_results[diff_id]
                d = PhabDiff(diff_id)
                d.update(diffInfo)
                phabDiffs.append(d)
            phabReview.update(title, dateCreated, dateModified, author)
            phabReview.setPhabDiffs(phabDiffs)
            print(
                "Updated D{0} modified on {1} ({2} diffs)".format(
                    id, datetime.fromtimestamp(dateModified), len(phabDiffs)
                )
            )

        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified

        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


def record_users(cache, users, phab):
    most_recent_info = None
    oldest_info = None
    for info in users["data"]:
        if info["type"] != "USER":
            continue
        id = info["id"]
        phid = info["phid"]
        dateModified = int(info["fields"]["dateModified"])
        # dateCreated = int(info["fields"]["dateCreated"])
        realName = info["fields"]["realName"]
        phabUser = cache.get(id)
        phabUser.update(phid, realName)
        if most_recent_info is None:
            most_recent_info = dateModified
        elif most_recent_info < dateModified:
            most_recent_info = dateModified
        if oldest_info is None:
            oldest_info = dateModified
        elif oldest_info > dateModified:
            oldest_info = dateModified
    return most_recent_info, oldest_info


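# Each entry is (cache, phab_query, order, record_results,
# max_nr_entries_per_fetch, max_nr_days_to_cache), matching the parameters of
# update_cached_info() above.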
PHABCACHESINFO = (
    (
        reviews_cache,
        ("differential", "revision", "search"),
        "updated",
        record_reviews,
        5,
        7,
    ),
    (users_cache, ("user", "search"), "newest", record_users, 100, 1000),
)


def load_cache():
    for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO:
        cache.populate_cache_from_disk()
        print(
            "Loaded {0} nr entries: {1}".format(
                cache.get_name(), len(cache.get_ids_in_cache())
            )
        )
        print(
            "Loaded {0} has most recent info: {1}".format(
                cache.get_name(),
                datetime.fromtimestamp(cache.most_recent_info)
                if cache.most_recent_info is not None
                else None,
            )
        )


def update_cache(phab):
    load_cache()
    for (
        cache,
        phab_query,
        order,
        record_results,
        max_nr_entries_per_fetch,
        max_nr_days_to_cache,
    ) in PHABCACHESINFO:
        update_cached_info(
            phab,
            cache,
            phab_query,
            order,
            record_results,
            max_nr_entries_per_fetch,
            max_nr_days_to_cache,
        )
        ids_in_cache = cache.get_ids_in_cache()
        print("{0} objects in {1}".format(len(ids_in_cache), cache.get_name()))
        cache.write_cache_to_disk()


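# Note that the cut-off below is computed relative to the newest cached
# review, not to datetime.now(), so a stale cache still yields a non-empty
# (if stale) result window.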
def get_most_recent_reviews(days):
    newest_reviews = sorted(reviews_cache.get_objects(), key=lambda r: -r.dateModified)
    if len(newest_reviews) == 0:
        return newest_reviews
    most_recent_review_time = datetime.fromtimestamp(newest_reviews[0].dateModified)
    cut_off_date = most_recent_review_time - timedelta(days=days)
    result = []
    for review in newest_reviews:
        if datetime.fromtimestamp(review.dateModified) < cut_off_date:
            return result
        result.append(review)
    return result


# All of the above code is about fetching data from Phabricator and caching it
# on local disk. The below code contains the actual "business logic" for this
# script.

_userphid2realname = None


def get_real_name_from_author(user_phid):
    global _userphid2realname
    if _userphid2realname is None:
        _userphid2realname = {}
        for user in users_cache.get_objects():
            _userphid2realname[user.phid] = user.realName
    return _userphid2realname.get(user_phid, "unknown")


def print_most_recent_reviews(phab, days, filter_reviewers):
    msgs = []

    def add_msg(msg):
        msgs.append(msg)
        print(msg.encode("utf-8"))

    newest_reviews = get_most_recent_reviews(days)
    add_msg(
        "These are the reviews that look interesting to review. "
        + "The report below has 2 sections. The first "
        + "section is organized per review; the second section is organized "
        + "per potential reviewer.\n"
    )
    oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
    oldest_datetime = (
        datetime.fromtimestamp(oldest_review.dateModified) if oldest_review else None
    )
    add_msg(
        (
            "The report below is based on analyzing the reviews that got "
            + "touched in the past {0} days (since {1}). "
            + "The script found {2} such reviews.\n"
        ).format(days, oldest_datetime, len(newest_reviews))
    )
    reviewer2reviews_and_scores = {}
    for i, review in enumerate(newest_reviews):
        matched_reviewers = find_reviewers_for_review(review)
        matched_reviewers = filter_reviewers(matched_reviewers)
        if len(matched_reviewers) == 0:
            continue
        add_msg(
            (
                "{0:>3}. https://reviews.llvm.org/D{1} by {2}\n     {3}\n"
                + "     Last updated on {4}"
            ).format(
                i,
                review.id,
                get_real_name_from_author(review.author),
                review.title,
                datetime.fromtimestamp(review.dateModified),
            )
        )
        for reviewer, scores in matched_reviewers:
            add_msg(
                "    potential reviewer {0}, score {1}".format(
                    reviewer,
                    "(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")",
                )
            )
            if reviewer not in reviewer2reviews_and_scores:
                reviewer2reviews_and_scores[reviewer] = []
            reviewer2reviews_and_scores[reviewer].append((review, scores))

    # Print out a summary per reviewer.
    for reviewer in sorted(reviewer2reviews_and_scores.keys()):
        reviews_and_scores = reviewer2reviews_and_scores[reviewer]
        reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
        add_msg(
            "\n\nSUMMARY FOR {0} (found {1} reviews):".format(
                reviewer, len(reviews_and_scores)
            )
        )
        for review, scores in reviews_and_scores:
            add_msg(
                "[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
                    "/".join(["{0:.1f}%".format(s) for s in scores]),
                    review.id,
                    review.title,
                    get_real_name_from_author(review.author),
                )
            )
    return "\n".join(msgs)


def get_git_cmd_output(cmd):
    output = None
    try:
        logging.debug(cmd)
        output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logging.debug(str(e))
    if output is None:
        return None
    return output.decode("utf-8", errors="ignore")


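# Matches the "author-mail <...>" lines that `git blame --line-porcelain`
# emits once per blamed line, e.g. "author-mail <jane@example.com>"
# (hypothetical address, for illustration).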
reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")


def parse_blame_output_line_porcelain(blame_output_lines):
    email2nr_occurences = {}
    if blame_output_lines is None:
        return email2nr_occurences
    for line in blame_output_lines:
        m = reAuthorMail.match(line)
        if m:
            author_email_address = m.group(1)
            if author_email_address not in email2nr_occurences:
                email2nr_occurences[author_email_address] = 1
            else:
                email2nr_occurences[author_email_address] += 1
    return email2nr_occurences

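# For example (illustrative): two "author-mail <jane@example.com>" lines in
# the blame output produce {"jane@example.com": 2}.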


class BlameOutputCache:
    def __init__(self):
        self.cache = {}

    def _populate_cache_for(self, cache_key):
        assert cache_key not in self.cache
        git_repo, base_revision, path = cache_key
        cmd = (
            "git -C {0} blame --encoding=utf-8 --date iso -f -e -w "
            + "--line-porcelain {1} -- {2}"
        ).format(git_repo, base_revision, path)
        blame_output = get_git_cmd_output(cmd)
        self.cache[cache_key] = (
            blame_output.split("\n") if blame_output is not None else None
        )
        # FIXME: the blame cache could probably be made more effective still if
        # instead of storing the requested base_revision in the cache, the last
        # revision before the base revision this file/path got changed in gets
        # stored. That way multiple project revisions for which this specific
        # file/path hasn't changed would get cache hits (instead of misses in
        # the current implementation).

    def get_blame_output_for(
        self, git_repo, base_revision, path, start_line=-1, end_line=-1
    ):
        cache_key = (git_repo, base_revision, path)
        if cache_key not in self.cache:
            self._populate_cache_for(cache_key)
        assert cache_key in self.cache
        all_blame_lines = self.cache[cache_key]
        if all_blame_lines is None:
            return None
        if start_line == -1 and end_line == -1:
            return all_blame_lines
        assert start_line >= 0
        assert end_line >= 0
        assert end_line <= len(all_blame_lines)
        assert start_line <= len(all_blame_lines)
        assert start_line <= end_line
        return all_blame_lines[start_line:end_line]

    def get_parsed_git_blame_for(
        self, git_repo, base_revision, path, start_line=-1, end_line=-1
    ):
        return parse_blame_output_line_porcelain(
            self.get_blame_output_for(
                git_repo, base_revision, path, start_line, end_line
            )
        )


blameOutputCache = BlameOutputCache()


def find_reviewers_for_diff_heuristic(diff):
    # Heuristic 1: assume good reviewers are the ones that touched the same
    # lines before as this patch is touching.
    # Heuristic 2: assume good reviewers are the ones that touched the same
    # files before as this patch is touching.
    reviewers2nr_lines_touched = {}
    reviewers2nr_files_touched = {}
    # Assume last revision before diff was modified is the revision the diff
    # applies to.
    assert len(GIT_REPO_METADATA) == 1
    git_repo = os.path.join("git_repos", GIT_REPO_METADATA[0][0])
    cmd = 'git -C {0} rev-list -n 1 --before="{1}" main'.format(
        git_repo,
        datetime.fromtimestamp(diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"),
    )
    base_revision = get_git_cmd_output(cmd).strip()
    logging.debug("Base revision={0}".format(base_revision))
    for change in diff.changes:
        path = change.oldPath
        # Compute heuristic 1: look at context of patch lines.
        for hunk in change.hunks:
            for start_line, end_line in hunk.actual_lines_changed_offset:
                # Collect git blame results for authors in those ranges.
                for (
                    reviewer,
                    nr_occurences,
                ) in blameOutputCache.get_parsed_git_blame_for(
                    git_repo, base_revision, path, start_line, end_line
                ).items():
                    if reviewer not in reviewers2nr_lines_touched:
                        reviewers2nr_lines_touched[reviewer] = 0
                    reviewers2nr_lines_touched[reviewer] += nr_occurences
        # Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for authors in those ranges.
        for reviewer, nr_occurences in blameOutputCache.get_parsed_git_blame_for(
            git_repo, base_revision, path
        ).items():
            if reviewer not in reviewers2nr_files_touched:
                reviewers2nr_files_touched[reviewer] = 0
            reviewers2nr_files_touched[reviewer] += 1

    # Compute "match scores"
    total_nr_lines = sum(reviewers2nr_lines_touched.values())
    total_nr_files = len(diff.changes)
    reviewers_matchscores = [
        (
            reviewer,
            (
                reviewers2nr_lines_touched.get(reviewer, 0) * 100.0 / total_nr_lines
                if total_nr_lines != 0
                else 0,
                reviewers2nr_files_touched[reviewer] * 100.0 / total_nr_files
                if total_nr_files != 0
                else 0,
            ),
        )
        for reviewer in reviewers2nr_files_touched
    ]
    reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
    return reviewers_matchscores

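# The resulting list is shaped like (illustrative values):
#   [("jane@example.com", (62.5, 40.0)), ("joe@example.com", (12.5, 20.0))]
# i.e. (reviewer email, (% of blamed lines in the changed ranges, % of
# touched files the reviewer authored changes in)).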


def find_reviewers_for_review(review):
    # Process the newest diff first.
    diffs = sorted(review.phabDiffs, key=lambda d: d.dateModified, reverse=True)
    if len(diffs) == 0:
        return []
    diff = diffs[0]
    matched_reviewers = find_reviewers_for_diff_heuristic(diff)
    # Show progress, as this is a slow operation:
    sys.stdout.write(".")
    sys.stdout.flush()
    logging.debug("matched_reviewers: {0}".format(matched_reviewers))
    return matched_reviewers


def update_git_repos():
    git_repos_directory = "git_repos"
    for name, url in GIT_REPO_METADATA:
        dirname = os.path.join(git_repos_directory, name)
        if not os.path.exists(dirname):
            cmd = "git clone {0} {1}".format(url, dirname)
            output = get_git_cmd_output(cmd)
        cmd = "git -C {0} pull --rebase".format(dirname)
        output = get_git_cmd_output(cmd)


def send_emails(email_addresses, sender, msg):
    s = smtplib.SMTP()
    s.connect()
    for email_address in email_addresses:
        email_msg = email.mime.multipart.MIMEMultipart()
        email_msg["From"] = sender
        email_msg["To"] = email_address
        email_msg["Subject"] = "LLVM patches you may be able to review."
        email_msg.attach(email.mime.text.MIMEText(msg.encode("utf-8"), "plain"))
        # python 3.x: s.send_message(email_msg)
        s.sendmail(email_msg["From"], email_msg["To"], email_msg.as_string())
    s.quit()


def filter_reviewers_to_report_for(people_to_look_for):
    # The below is just an example filter, to only report potential reviews
    # to do for the people that will receive the report email.
    return lambda potential_reviewers: [
        r for r in potential_reviewers if r[0] in people_to_look_for
    ]

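# For example (illustrative): with people_to_look_for == ["jane@example.com"],
# [("jane@example.com", (50.0, 10.0)), ("joe@example.com", (5.0, 1.0))]
# filters down to [("jane@example.com", (50.0, 10.0))].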

def main():
    parser = argparse.ArgumentParser(
        description="Match open reviews to potential reviewers."
    )
    parser.add_argument(
        "--no-update-cache",
        dest="update_cache",
        action="store_false",
        default=True,
        help="Do not update cached Phabricator objects",
    )
    parser.add_argument(
        "--email-report",
        dest="email_report",
        nargs="*",
        default=[],
        help="The email addresses to send the report to.",
    )
    parser.add_argument(
        "--sender",
        dest="sender",
        default="",
        help="The email address to use in 'From' on messages emailed out.",
    )
    parser.add_argument(
        "--email-addresses",
        dest="email_addresses",
        nargs="*",
        help="The email addresses (as known by LLVM git) of "
        + "the people to look for reviews for.",
    )
    parser.add_argument("--verbose", "-v", action="count", default=0)

    args = parser.parse_args()

    if args.verbose >= 1:
        logging.basicConfig(level=logging.DEBUG)

    people_to_look_for = [
        e.decode("utf-8") if isinstance(e, bytes) else e
        for e in (args.email_addresses or [])
    ]
    logging.debug(
        "Will look for reviews that the following contributors could "
        + "review: {}".format(people_to_look_for)
    )
    logging.debug("Will email a report to: {}".format(args.email_report))

    phab = init_phab_connection()

    if args.update_cache:
        update_cache(phab)

    load_cache()
    update_git_repos()
    msg = print_most_recent_reviews(
        phab,
        days=1,
        filter_reviewers=filter_reviewers_to_report_for(people_to_look_for),
    )

    if args.email_report:
        send_emails(args.email_report, args.sender, msg)


if __name__ == "__main__":
    main()