xref: /llvm-project/llvm/utils/filecheck_lint/filecheck_lint.py (revision b71edfaa4ec3c998aadb35255ce2f60bba2940b0)
1# ===----------------------------------------------------------------------===##
2#
3# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4# See https://llvm.org/LICENSE.txt for license information.
5# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6#
7# ===----------------------------------------------------------------------===##
8"""A linter that detects potential typos in FileCheck directive names.
9
10Consider a broken test foo.cpp:
11
12// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
13// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
14auto x = 42;
15// NEWW: auto is a c++11 extension
16// ODL-NOT: auto is a c++11 extension
17
18We first detect the locally valid FileCheck directive prefixes by parsing the
19--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
20{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.
21
22Then we look for lines that look like directives. These are of the form 'FOO:',
23usually at the beginning of a line or a comment. If any of these are a
24"near-miss" for a directive name, then we suspect this is a typo and report it.
25
26Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
27"""
28
29import itertools
30import logging
31import pathlib
32import re
33import sys
34from typing import Generator, Sequence, Tuple
35
# Maximum (inclusive) edit distance between a potential directive and a known
# directive name for the potential directive to be reported as a typo.
_distance_threshold = 3
# Prefixes that are always valid, even without --check-prefix(es) flags.
_prefixes = {"CHECK"}
# Suffixes that may be appended to any valid prefix to form a directive name.
_suffixes = {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"}
# Directive names used by lit rather than FileCheck; they look like
# directives but must not be reported.
_lit_directives = {
    "RUN",
    "REQUIRES",
    "UNSUPPORTED",
    "XFAIL",
    "DEFINE",
    "REDEFINE",
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {"COM", "RUN"}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_ignore = _lit_directives.union(_comment_prefixes).union({"NOTE", "TODO"})
53
54
def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
    """Returns the Levenshtein edit distance between `s1` and `s2`.

    Insertions, deletions, and substitutions each count as one operation.
    """
    # If either string is empty, the distance is the length of the other
    # (pure insertions/deletions).
    if not s1 or not s2:
        return max(len(s1), len(s2))

    # Classic dynamic-programming formulation, keeping only the previous row.
    previous_row = list(range(len(s2) + 1))
    for row, ch1 in enumerate(s1, start=1):
        current_row = [row]
        for col, ch2 in enumerate(s2, start=1):
            substitution = previous_row[col - 1] + (ch1 != ch2)
            deletion = previous_row[col] + 1
            insertion = current_row[-1] + 1
            current_row.append(min(substitution, deletion, insertion))
        previous_row = current_row
    return previous_row[-1]
77
78
class FileRange:
    """Coordinates of a span that lies on a single line of a file.

    Attributes:
      line:         the 1-based line number
      start_column: the (inclusive) column where the span starts
      end_column:   the (inclusive) column where the span ends
    """

    line: int
    start_column: int
    end_column: int

    def __init__(
        self, content: str, start_byte: int, end_byte: int
    ):  # pylint: disable=g-doc-args
        """Derives the span's coordinates from `content` and byte offsets.

        `start_byte` and `end_byte` are assumed to be on the same line.
        """
        prefix = content[:start_byte]
        # rfind returns -1 when the span is on the first line, which makes the
        # 1-based column arithmetic below work without a special case.
        last_newline = prefix.rfind("\n")
        self.line = prefix.count("\n") + 1
        self.start_column = start_byte - last_newline
        self.end_column = self.start_column + (end_byte - start_byte) - 1

    def __str__(self) -> str:
        return f"{self.line}:{self.start_column}-{self.end_column}"
106
107
class Diagnostic:
    """A single detected typo together with its suggested fix.

    Attributes:
      filepath:   the path to the file in which the typo was found
      filerange:  the position at which the typo was found in the file
      typo:       the typo
      fix:        a suggested fix
    """

    filepath: pathlib.Path
    filerange: FileRange
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: FileRange,
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        # Rendered as: <path>:<line>:<start>-<end>: <summary>
        return f"{self.filepath}:{self.filerange}: {self.summary()}"

    def summary(self) -> str:
        return (
            f'Found potentially misspelled directive "{self.typo}". Did you mean '
            f'"{self.fix}"?'
        )
143
144
def find_potential_directives(
    content: str,
) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts every potential FileCheck directive from `content`.

    The definition of "potential directive" is deliberately loose: we would
    rather capture too many strings than miss a real typo.

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # A candidate is 'FOO:' at the start of a line or after a comment marker.
    pattern = re.compile(
        r"(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):", re.MULTILINE
    )
    for match in pattern.finditer(content):
        start, end = match.span(1)
        yield FileRange(content, start, end), match.group(1)
166
167
# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(
    content: str,
) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
    """Yields the custom check prefixes defined in `content`.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'

    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    # A prefix argument is single-quoted, double-quoted, or bare.
    param_re = r"|".join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
    flag_re = r"-check-prefix(?:es)?(?:\s+|=)({})".format(param_re)
    for match in re.finditer(flag_re, content):
        arg = match.group(1)
        # Strip surrounding quotes, if any.
        if arg[0] in "'\"":
            arg = arg[1:-1]
        yield from arg.split(",")
189
190
def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive is
        classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    # All valid directive names: every prefix/suffix combination, the ignored
    # names (so their near-misses are not attributed to real directives), and
    # the bare prefixes themselves.
    all_directives = (
        [
            f"{prefix}{suffix}"
            for prefix, suffix in itertools.product(all_prefixes, _suffixes)
        ]
        + list(_ignore)
        + list(all_prefixes)
    )
    # Hoisted loop invariant: the longest directive name, used below to discard
    # candidates that are too long to ever match within `threshold`.
    max_directive_length = max(map(len, all_directives))

    def find_best_match(typo):
        # The (threshold + 1, typo) sentinel guarantees a result, and loses to
        # any directive whose edit distance is within the threshold.
        return min(
            [(threshold + 1, typo)]
            + [
                (levenshtein(typo, d), d)
                for d in all_directives
                if abs(len(d) - len(typo)) <= threshold
            ],
            key=lambda tup: tup[0],
        )

    potential_directives = find_potential_directives(content)

    for filerange, potential_directive in potential_directives:
        # TODO(bchetioui): match count directives more finely. We skip directives
        # starting with 'CHECK-COUNT-' for the moment as they require more complex
        # logic to be handled correctly.
        if any(
            potential_directive.startswith(f"{prefix}-COUNT-")
            for prefix in all_prefixes
        ):
            continue

        # Ignoring potential typos that will not be matched later due to a too low
        # threshold, in order to avoid potentially long computation times.
        if len(potential_directive) > max_directive_length + threshold:
            continue

        score, best_match = find_best_match(potential_directive)
        if score == 0:  # This is an actual directive, ignore.
            continue
        elif score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive, best_match)
251
252
def main(argv: Sequence[str]):
    """Lints each file named in `argv[1:]`, printing one line per diagnostic."""
    if len(argv) < 2:
        print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n")
        # Use sys.exit rather than the `exit` builtin: the latter is injected
        # by the `site` module for interactive use and may be absent (e.g.
        # when running under `python -S`).
        sys.exit(1)

    for filepath in argv[1:]:
        logging.info("Checking %s", filepath)
        # Test files are expected to be UTF-8; do not depend on the platform's
        # locale encoding, which may misdecode them (e.g. cp1252 on Windows).
        with open(filepath, "rt", encoding="utf-8") as f:
            content = f.read()
        for diagnostic in find_directive_typos(
            content,
            pathlib.Path(filepath),
            threshold=_distance_threshold,
        ):
            print(diagnostic)


if __name__ == "__main__":
    main(sys.argv)
272