# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
"""A linter that detects potential typos in FileCheck directive names.

Consider a broken test foo.cpp:

// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
auto x = 42;
// NEWW: auto is a c++11 extension
// ODL-NOT: auto is a c++11 extension

We first detect the locally valid FileCheck directive prefixes by parsing the
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.

Then we look for lines that look like directives. These are of the form 'FOO:',
usually at the beginning of a line or a comment. If any of these are a
"near-miss" for a directive name, then we suspect this is a typo and report it.

Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
"""

import itertools
import logging
import pathlib
import re
import sys
from typing import Generator, Sequence, Tuple

# Maximum (inclusive) edit distance at which a potential directive is still
# reported as a typo of a real directive (passed to find_directive_typos).
_distance_threshold = 3
# Prefixes that are always valid, even when no --check-prefix flag is given.
_prefixes = {'CHECK'}
# Suffixes that can be appended to any valid prefix to form a directive name.
_suffixes = {'-DAG', '-COUNT', '-EMPTY', '-LABEL', '-NEXT', '-NOT', '-SAME'}
# Directives consumed by lit rather than FileCheck; they look like FileCheck
# directives, so they must not be reported as typos.
_lit_directives = {
    'RUN',
    'REQUIRES',
    'UNSUPPORTED',
    'XFAIL',
    'DEFINE',
    'REDEFINE',
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {'COM', 'RUN'}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_ignore = _lit_directives.union(_comment_prefixes).union({'NOTE', 'TODO'})


def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
  """Returns the Levenshtein edit distance between `s1` and `s2`.

  Each insertion, deletion, or substitution counts as one operation.
  """
  if not s1:
    return len(s2)
  if not s2:
    return len(s1)

  # Classic dynamic-programming formulation: `previous_row[col]` holds the
  # distance between the first `row - 1` characters of s1 and the first `col`
  # characters of s2.
  previous_row = list(range(len(s2) + 1))
  for row, char1 in enumerate(s1, start=1):
    current_row = [row]
    for col, char2 in enumerate(s2, start=1):
      substitution = previous_row[col - 1] + (char1 != char2)
      deletion = previous_row[col] + 1
      insertion = current_row[-1] + 1
      current_row.append(min(substitution, deletion, insertion))
    previous_row = current_row
  return previous_row[-1]


class FileRange:
  """Represents the coordinates of a span lying on a single line of a file.

  Attributes:
    line:         the line number
    start_column: the (inclusive) column where the span starts
    end_column:   the (inclusive) column where the span ends
  """
  line: int
  start_column: int
  end_column: int

  def __init__(self, content: str, start_byte: int, end_byte: int):  # pylint: disable=g-doc-args
    """Computes the span's coordinates from its start/end bytes in a string.

    `start_byte` and `end_byte` are assumed to be on the same line.
    """
    prefix = content[:start_byte]
    # rfind returns -1 when the span lies on the first line, which makes the
    # column arithmetic below come out 1-indexed in that case too.
    last_newline = prefix.rfind('\n')
    self.line = prefix.count('\n') + 1
    self.start_column = start_byte - last_newline
    self.end_column = self.start_column + (end_byte - start_byte - 1)

  def __str__(self) -> str:
    return f'{self.line}:{self.start_column}-{self.end_column}'


class Diagnostic:
  """Describes a single suspected typo together with its suggested fix.

  Attributes:
    filepath:   the path to the file in which the typo was found
    filerange:  the position at which the typo was found in the file
    typo:       the typo
    fix:        a suggested fix
  """

  filepath: pathlib.Path
  filerange: FileRange
  typo: str
  fix: str

  def __init__(
      self,
      filepath: pathlib.Path,
      filerange: FileRange,
      typo: str,
      fix: str  # pylint: disable=redefined-outer-name
  ):
    self.filepath = filepath
    self.filerange = filerange
    self.typo = typo
    self.fix = fix

  def __str__(self) -> str:
    # Rendered as `path:line:start-end: <summary>`.
    location = f'{self.filepath}:{self.filerange}'
    return f'{location}: {self.summary()}'

  def summary(self) -> str:
    return (
        f'Found potentially misspelled directive "{self.typo}". Did you mean '
        f'"{self.fix}"?')


def find_potential_directives(
    content: str,) -> Generator[Tuple[FileRange, str], None, None]:
  """Yields every string in `content` that could be a FileCheck directive.

  The notion of a potential directive is deliberately loose: we prefer to
  capture more strings than necessary rather than miss a real directive.

  Args:
    content: the string in which to look for directives

  Yields:
    Tuples (p, d) where p is the span where the potential directive occurs
    within the string and d is the potential directive.
  """
  # A candidate is a word (possibly containing spaces, dashes, digits)
  # followed by a colon, at the start of a line or after a comment marker.
  directive_pattern = re.compile(
      r'(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):', re.MULTILINE)
  for match in directive_pattern.finditer(content):
    start, end = match.span(1)
    yield (FileRange(content, start, end), match.group(1))


# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(content: str) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
  """Yields the custom check prefixes defined in the string provided.

  For example, given the following file content:
    RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
    RUN: something_else | FileCheck %s -check-prefix 'CHECK3'

  the custom prefixes are CHECK1, CHECK2, and CHECK3.
  """
  # The flag argument may be single-quoted, double-quoted, or bare.
  param_re = r'|'.join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
  flag_pattern = re.compile(
      r'-check-prefix(?:es)?(?:\s+|=)({})'.format(param_re))
  for m in flag_pattern.finditer(content):
    arg = m.group(1)
    # Every regex alternative matches at least one character, so arg[0] is
    # safe; quoted arguments get their surrounding quotes stripped.
    if arg[0] in ('\'', '"'):
      arg = arg[1:-1]
    yield from arg.split(',')


def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
  """Detects potential typos in FileCheck directives.

  Args:
    content: the content of the file
    filepath: the path to the file to check for typos in directives
    threshold: the (inclusive) maximum edit distance between a potential
      directive and an actual directive, such that the potential directive is
      classified as a typo

  Yields:
    Diagnostics, in order from the top of the file.
  """
  # Valid directive names: every default or custom prefix, both alone and
  # combined with each known suffix, plus the names we never want to flag.
  all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
  all_directives = ([
      f'{prefix}{suffix}'
      for prefix, suffix in itertools.product(all_prefixes, _suffixes)
  ] + list(_ignore) + list(all_prefixes))

  def find_best_match(typo):
    # Returns the (distance, directive) pair with the smallest edit distance
    # to `typo`. The sentinel (threshold + 1, typo) is listed first so that
    # min() falls back to "no match" unless some directive scores strictly
    # better; the length pre-filter skips directives whose length alone
    # already puts them past the threshold.
    return min(
        [(threshold + 1, typo)] + [(levenshtein(typo, d), d)
                                   for d in all_directives
                                   if abs(len(d) - len(typo)) <= threshold],
        key=lambda tup: tup[0],
    )

  potential_directives = find_potential_directives(content)

  for filerange, potential_directive in potential_directives:
    # TODO(bchetioui): match count directives more finely. We skip directives
    # starting with 'CHECK-COUNT-' for the moment as they require more complex
    # logic to be handled correctly.
    if any(
        potential_directive.startswith(f'{prefix}-COUNT-')
        for prefix in all_prefixes):
      continue

    # Ignoring potential typos that will not be matched later due to a too low
    # threshold, in order to avoid potentially long computation times.
    if len(potential_directive) > max(map(len, all_directives)) + threshold:
      continue

    score, best_match = find_best_match(potential_directive)
    if score == 0:  # This is an actual directive, ignore.
      continue
    elif score <= threshold and best_match not in _ignore:
      yield Diagnostic(filepath, filerange, potential_directive, best_match)


def main(argv: Sequence[str]):
  """Lints every file named on the command line, printing diagnostics.

  Args:
    argv: the program's argument vector; argv[1:] are the paths of the files
      to check.
  """
  if len(argv) < 2:
    print(f'Usage: {argv[0]} path/to/file/1 ... path/to/file/n')
    # Use sys.exit rather than the `exit` builtin: the latter is injected by
    # the `site` module for interactive use and is absent under `python -S`.
    sys.exit(1)

  for filepath in argv[1:]:
    logging.info('Checking %s', filepath)
    with open(filepath, 'rt') as f:
      content = f.read()
    for diagnostic in find_directive_typos(
        content,
        pathlib.Path(filepath),
        threshold=_distance_threshold,
    ):
      print(diagnostic)


if __name__ == '__main__':
  main(sys.argv)