xref: /llvm-project/llvm/utils/filecheck_lint/filecheck_lint.py (revision 42ebf3eaafc2a5c3c9338020186c0ad44cc4edf7)
13a8d176aSJay Foad#!/usr/bin/env python3
2f702c759SBenjamin Chetioui# ===----------------------------------------------------------------------===##
3f702c759SBenjamin Chetioui#
4f702c759SBenjamin Chetioui# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5f702c759SBenjamin Chetioui# See https://llvm.org/LICENSE.txt for license information.
6f702c759SBenjamin Chetioui# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7f702c759SBenjamin Chetioui#
8f702c759SBenjamin Chetioui# ===----------------------------------------------------------------------===##
"""A linter that detects potential typos in FileCheck directive names.

Consider a broken test foo.cpp:

// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
auto x = 42;
// NEWW: auto is a c++11 extension
// ODL-NOT: auto is a c++11 extension

We first detect the locally valid FileCheck directive prefixes by parsing the
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.

Then we look for lines that look like directives. These are of the form 'FOO:',
usually at the beginning of a line or a comment. If any of these are a
"near-miss" for a directive name, then we suspect this is a typo and report it.

Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
"""
29f702c759SBenjamin Chetioui
30f702c759SBenjamin Chetiouiimport itertools
31f702c759SBenjamin Chetiouiimport logging
32f702c759SBenjamin Chetiouiimport pathlib
33f702c759SBenjamin Chetiouiimport re
34f702c759SBenjamin Chetiouiimport sys
35f702c759SBenjamin Chetiouifrom typing import Generator, Sequence, Tuple
36f702c759SBenjamin Chetioui
# Maximum edit distance that main() passes to find_directive_typos().
_distance_threshold = 3
# Check prefix that is always considered valid; prefixes parsed from
# -check-prefix(es) flags are added to it per file.
_prefixes = {"CHECK"}
# Suffixes that, appended to a prefix, form a directive name.
_suffixes = {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"}
# Keywords that look like 'FOO:' directives but are lit directives, not
# FileCheck directives.
_lit_directives = {
    "RUN",
    "REQUIRES",
    "UNSUPPORTED",
    "XFAIL",
    "DEFINE",
    "REDEFINE",
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {"COM", "RUN"}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_ignore = _lit_directives.union(_comment_prefixes).union({"NOTE", "TODO"})
54f702c759SBenjamin Chetioui
55f702c759SBenjamin Chetioui
def levenshtein(s1: str, s2: str) -> int:
    """Computes the edit distance between two strings.

    Additions, deletions, and substitutions all count as a single operation.

    Args:
      s1: the first string
      s2: the second string

    Returns:
      The minimum number of single-character operations turning s1 into s2.
    """
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)

    # previous_row[j] holds the distance between the prefix of s1 processed so
    # far and s2[:j]; only one row of the table is kept at a time.
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        current_row = [i]
        for j, c2 in enumerate(s2):
            current_row.append(
                min(
                    previous_row[j] + int(c1 != c2),  # substitution, free on match
                    previous_row[j + 1] + 1,  # drop c1
                    current_row[-1] + 1,  # insert c2
                )
            )
        previous_row = current_row
    return previous_row[-1]
78f702c759SBenjamin Chetioui
79f702c759SBenjamin Chetioui
class FileRange:
    """Stores the coordinates of a span within a file's content.

    The span is assumed to lie on a single line: the reported line number is
    derived from the position of `start_byte` only.

    Despite their names, the offsets index `content` as a `str`.

    Attributes:
      content:    the full text in which the span is located
      start_byte: the (inclusive) offset into `content` where the span starts
      end_byte:   the (exclusive) offset into `content` where the span ends
    """

    content: str
    start_byte: int
    end_byte: int

    def __init__(
        self, content: str, start_byte: int, end_byte: int
    ):  # pylint: disable=g-doc-args
        """
        Stores the coordinates of a span based on a string and start/end offsets.

        `start_byte` and `end_byte` are assumed to be on the same line.
        """
        self.content = content
        self.start_byte = start_byte
        self.end_byte = end_byte

    def as_str(self) -> str:
        """
        Derives the 'line:start_column-end_column' form of the span.

        Lines and columns are 1-based:
          start_column: the (inclusive) column where the span starts
          end_column:   the (inclusive) column where the span ends
        """
        content_before_span = self.content[: self.start_byte]
        line = content_before_span.count("\n") + 1
        # rfind() returns -1 when the span is on the first line, which makes
        # the column 1-based in that case as well.
        start_column = self.start_byte - content_before_span.rfind("\n")
        # end_byte is exclusive, so the last column covered is one before it.
        end_column = start_column + (self.end_byte - self.start_byte - 1)

        return f"{line}:{start_column}-{end_column}"
118f702c759SBenjamin Chetioui
119f702c759SBenjamin Chetioui
class Diagnostic:
    """Stores information about one typo and a suggested fix.

    Attributes:
      filepath:   the path to the file in which the typo was found
      filerange:  the position at which the typo was found in the file
      typo:       the typo
      fix:        a suggested fix
    """

    filepath: pathlib.Path
    filerange: "FileRange"
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: "FileRange",
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        """Returns 'path:line:start_column-end_column: summary'."""
        return f"{self.filepath}:{self.filerange.as_str()}: {self.summary()}"

    def summary(self) -> str:
        """Returns the human-readable message, without the location."""
        return (
            f'Found potentially misspelled directive "{self.typo}". Did you mean '
            f'"{self.fix}"?'
        )
155f702c759SBenjamin Chetioui
156f702c759SBenjamin Chetioui
def find_potential_directives(
    content: str,
) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts all the potential FileCheck directives from a string.

    What constitutes a potential directive is loosely defined---we err on the side
    of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # A candidate starts at the beginning of a line or after a comment leader
    # ('//', ';' or '#'), optionally followed by characters that cannot be part
    # of a directive. The captured group is a run of word characters, hyphens
    # and whitespace (not starting with whitespace) immediately followed by ':'.
    directive_pattern = re.compile(
        r"(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):", re.MULTILINE
    )
    for match in directive_pattern.finditer(content):
        start, end = match.span(1)
        yield (FileRange(content, start, end), match.group(1))
178f702c759SBenjamin Chetioui
179f702c759SBenjamin Chetioui
# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(
    content: str,
) -> Generator[str, None, None]:
    """Parses custom prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'

    the custom prefixes are CHECK1, CHECK2, and CHECK3.

    Args:
      content: the string in which to look for -check-prefix(es) flags

    Yields:
      Each non-empty prefix, in order of appearance. Empty entries (e.g. from
      'A,,B', a trailing comma or an empty quoted value) are skipped, since an
      empty prefix would turn bare suffixes such as '-NOT' into directives.
    """
    # The flag value is either single-quoted, double-quoted, or a bare run of
    # characters containing no quotes or whitespace.
    param_re = r"|".join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
    flag_pattern = re.compile(
        r"-check-prefix(?:es)?(?:\s+|=)({})".format(param_re)
    )
    for m in flag_pattern.finditer(content):
        prefixes = m.group(1)
        if prefixes.startswith(("'", '"')):
            prefixes = prefixes[1:-1]  # Strip the surrounding quotes.
        for prefix in prefixes.split(","):
            if prefix:
                yield prefix
201f702c759SBenjamin Chetioui
202f702c759SBenjamin Chetioui
def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive is
        classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    all_directives = (
        [
            f"{prefix}{suffix}"
            for prefix, suffix in itertools.product(all_prefixes, _suffixes)
        ]
        + list(_ignore)
        + list(all_prefixes)
    )

    # Loop invariants, computed once rather than for every candidate.
    count_directive_prefixes = tuple(
        f"{prefix}-COUNT-" for prefix in all_prefixes
    )
    max_candidate_length = max(map(len, all_directives)) + threshold

    def find_best_match(typo):
        # The leading (threshold + 1, typo) entry is the "no match" result: it
        # wins whenever no directive is within the threshold.
        return min(
            [(threshold + 1, typo)]
            + [
                (levenshtein(typo, d), d)
                for d in all_directives
                if abs(len(d) - len(typo)) <= threshold
            ],
            key=lambda tup: tup[0],
        )

    # Cache (score, best_match) per candidate to skip recalculating.
    best_match_cache = {}
    for filerange, potential_directive in find_potential_directives(content):
        # TODO(bchetioui): match count directives more finely. We skip directives
        # starting with 'CHECK-COUNT-' for the moment as they require more complex
        # logic to be handled correctly.
        if potential_directive.startswith(count_directive_prefixes):
            continue

        # Ignoring potential typos that will not be matched later due to a too low
        # threshold, in order to avoid potentially long computation times.
        if len(potential_directive) > max_candidate_length:
            continue

        if potential_directive not in best_match_cache:
            best_match_cache[potential_directive] = find_best_match(
                potential_directive
            )
        score, best_match = best_match_cache[potential_directive]

        if score == 0:  # This is an actual directive, ignore.
            continue
        if score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive, best_match)
273f702c759SBenjamin Chetioui
274f702c759SBenjamin Chetioui
def main(argv: Sequence[str]) -> None:
    """Lints every file named on the command line and prints the diagnostics.

    Args:
      argv: the command line; argv[0] is the program name and the remaining
        entries are paths to the files to check

    Raises:
      SystemExit: with code 1 after printing a usage message, if no file path
        is given
    """
    if len(argv) < 2:
        print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n")
        # sys.exit rather than the builtin exit(), which is only injected by
        # the site module and is meant for interactive use.
        sys.exit(1)

    for filepath in argv[1:]:
        logging.info("Checking %s", filepath)
        with open(filepath, "rt") as f:
            content = f.read()
        for diagnostic in find_directive_typos(
            content,
            pathlib.Path(filepath),
            threshold=_distance_threshold,
        ):
            print(diagnostic)
290f702c759SBenjamin Chetioui
291f702c759SBenjamin Chetioui
# Run the linter only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)
294