xref: /llvm-project/llvm/utils/filecheck_lint/filecheck_lint.py (revision 42ebf3eaafc2a5c3c9338020186c0ad44cc4edf7)
1#!/usr/bin/env python3
2# ===----------------------------------------------------------------------===##
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8# ===----------------------------------------------------------------------===##
9"""A linter that detects potential typos in FileCheck directive names.
10
11Consider a broken test foo.cpp:
12
13// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
14// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
15auto x = 42;
16// NEWW: auto is a c++11 extension
17// ODL-NOT: auto is a c++11 extension
18
19We first detect the locally valid FileCheck directive prefixes by parsing the
20--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
21{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.
22
23Then we look for lines that look like directives. These are of the form 'FOO:',
24usually at the beginning of a line or a comment. If any of these are a
25"near-miss" for a directive name, then we suspect this is a typo and report it.
26
27Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
28"""
29
30import itertools
31import logging
32import pathlib
33import re
34import sys
35from typing import Generator, Sequence, Tuple
36
# Maximum (inclusive) edit distance at which a potential directive is reported
# as a typo of a real directive.
_distance_threshold = 3
# FileCheck's built-in check prefix; per-file prefixes parsed from
# --check-prefix(es) flags are added to this set.
_prefixes = {"CHECK"}
# Directive suffixes that may follow any valid check prefix.
_suffixes = {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"}
# lit directives that look like FileCheck directives ('NAME:') but are not.
_lit_directives = {
    "RUN",
    "REQUIRES",
    "UNSUPPORTED",
    "XFAIL",
    "DEFINE",
    "REDEFINE",
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {"COM", "RUN"}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_ignore = _lit_directives.union(_comment_prefixes).union({"NOTE", "TODO"})
54
55
def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
    """Returns the Levenshtein edit distance between `s1` and `s2`.

    Each insertion, deletion, and substitution counts as one operation.
    """
    if not s1:
        return len(s2)
    if not s2:
        return len(1 * s1)

    # Rolling single-row dynamic program: previous_row[j] is the distance
    # between the prefix of s1 consumed so far and s2[:j].
    previous_row = list(range(len(s2) + 1))
    for row_number, left_char in enumerate(s1, start=1):
        current_row = [row_number]
        for column, top_char in enumerate(s2):
            substitution = previous_row[column] + (left_char != top_char)
            deletion = previous_row[column + 1] + 1
            insertion = current_row[-1] + 1
            current_row.append(min(substitution, deletion, insertion))
        previous_row = current_row
    return previous_row[-1]
78
79
class FileRange:
    """Stores the coordinates of a span on a single line within a file.

    Attributes:
      content:    the whole string that the byte offsets index into
      start_byte: the (inclusive) byte offset at which the span starts
      end_byte:   the (exclusive) byte offset at which the span ends, matching
                  the convention of `re.Match.span`
    """

    content: str
    start_byte: int
    end_byte: int

    def __init__(
        self, content: str, start_byte: int, end_byte: int
    ):  # pylint: disable=g-doc-args
        """
        Stores the coordinates of a span based on a string and start/end bytes.

        `start_byte` (inclusive) and `end_byte` (exclusive) are assumed to be
        on the same line.
        """
        self.content = content
        self.start_byte = start_byte
        self.end_byte = end_byte

    def as_str(self):
        """Renders the span as '<line>:<start_column>-<end_column>'.

        Line and columns are 1-based; both columns are inclusive.
        """
        # The text before the span determines the line number (newline count)
        # and the column base (position of the last newline, or -1 if none).
        content_before_span = self.content[: self.start_byte]
        line = content_before_span.count("\n") + 1
        start_column = self.start_byte - content_before_span.rfind("\n")
        # end_byte is exclusive, hence the -1 to obtain an inclusive column.
        end_column = start_column + (self.end_byte - self.start_byte - 1)

        return f"{line}:{start_column}-{end_column}"
118
119
class Diagnostic:
    """Represents one suspected directive typo together with a suggested fix.

    Attributes:
      filepath:   the path to the file in which the typo was found
      filerange:  the position at which the typo was found in the file
      typo:       the typo
      fix:        a suggested fix
    """

    filepath: pathlib.Path
    filerange: FileRange
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: FileRange,
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        # '<path>:<line>:<start>-<end>: <message>'
        location = self.filerange.as_str()
        return f"{self.filepath}:{location}: {self.summary()}"

    def summary(self) -> str:
        return (
            f'Found potentially misspelled directive "{self.typo}". Did you mean '
            f'"{self.fix}"?'
        )
155
156
def find_potential_directives(
    content: str,
) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts all the potential FileCheck directives from a string.

    What constitutes a potential directive is loosely defined---we err on the side
    of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # Anything of the form 'FOO:' at the start of a line or after a common
    # comment marker, where FOO may contain word characters, dashes and spaces.
    pattern = re.compile(
        r"(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):", re.MULTILINE
    )
    for match in pattern.finditer(content):
        start, end = match.span(1)
        yield FileRange(content, start, end), match.group(1)
178
179
# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(
    content: str,
) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
    """Parses custom prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'

    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    # A prefix argument is single-quoted, double-quoted, or a bare token.
    param_re = r"|".join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
    flag_re = r"-check-prefix(?:es)?(?:\s+|=)({})".format(param_re)
    for match in re.finditer(flag_re, content):
        argument = match.group(1)
        # Strip the surrounding quotes, if any, before splitting the list.
        if argument[0] in ("'", '"'):
            argument = argument[1:-1]
        yield from argument.split(",")
201
202
def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive is
        classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    all_directives = (
        [
            f"{prefix}{suffix}"
            for prefix, suffix in itertools.product(all_prefixes, _suffixes)
        ]
        + list(_ignore)
        + list(all_prefixes)
    )
    # Hoisted out of the loop below: the longest directive length bounds which
    # potential directives are worth scoring at all.
    max_directive_len = max(map(len, all_directives))

    def find_best_match(typo):
        # Returns (edit distance, directive) for the closest known directive.
        # The (threshold + 1, typo) sentinel wins the min() when nothing is
        # close enough, so callers can detect "no match" via the score.
        return min(
            [(threshold + 1, typo)]
            + [
                (levenshtein(typo, d), d)
                for d in all_directives
                # A length difference above the threshold already implies an
                # edit distance above the threshold; skip those cheaply.
                if abs(len(d) - len(typo)) <= threshold
            ],
            key=lambda tup: tup[0],
        )

    potential_directives = find_potential_directives(content)
    # Cache (score, best_match) per distinct potential directive to avoid
    # recomputing the quadratic edit distance for repeated strings.
    score_and_best_match_for_potential_directive = {}
    for filerange, potential_directive in potential_directives:
        # TODO(bchetioui): match count directives more finely. We skip directives
        # starting with 'CHECK-COUNT-' for the moment as they require more complex
        # logic to be handled correctly.
        if any(
            potential_directive.startswith(f"{prefix}-COUNT-")
            for prefix in all_prefixes
        ):
            continue

        # Ignoring potential typos that will not be matched later due to a too low
        # threshold, in order to avoid potentially long computation times.
        if len(potential_directive) > max_directive_len + threshold:
            continue

        if potential_directive in score_and_best_match_for_potential_directive:
            score, best_match = score_and_best_match_for_potential_directive[
                potential_directive
            ]
        else:
            score, best_match = find_best_match(potential_directive)
            score_and_best_match_for_potential_directive[potential_directive] = (
                score,
                best_match,
            )
        if score == 0:  # This is an actual directive, ignore.
            continue
        elif score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive, best_match)
273
274
def main(argv: Sequence[str]):
    """Lints every file named on the command line, printing diagnostics."""
    if len(argv) < 2:
        print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n")
        exit(1)

    for filepath in argv[1:]:
        logging.info("Checking %s", filepath)
        path = pathlib.Path(filepath)
        content = path.read_text()
        diagnostics = find_directive_typos(
            content, path, threshold=_distance_threshold
        )
        for diagnostic in diagnostics:
            print(diagnostic)


if __name__ == "__main__":
    main(sys.argv)
294