# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
"""A linter that detects potential typos in FileCheck directive names.

Consider a broken test foo.cpp:

// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
auto x = 42;
// NEWW: auto is a c++11 extension
// ODL-NOT: auto is a c++11 extension

We first detect the locally valid FileCheck directive prefixes by parsing the
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.

Then we look for lines that look like directives. These are of the form 'FOO:',
usually at the beginning of a line or a comment. If any of these are a
"near-miss" for a directive name, then we suspect this is a typo and report it.

Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
"""

import itertools
import logging
import pathlib
import re
import sys
from typing import Generator, Sequence, Tuple

_distance_threshold = 3
_prefixes = {"CHECK"}
_suffixes = {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"}
_lit_directives = {
    "RUN",
    "REQUIRES",
    "UNSUPPORTED",
    "XFAIL",
    "DEFINE",
    "REDEFINE",
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {"COM", "RUN"}
# 'NOTE' and 'TODO' are not directives, but they look like ones and would
# otherwise generate noisy false positives, so we ignore them as well.
_ignore = _lit_directives.union(_comment_prefixes).union({"NOTE", "TODO"})


def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
    """Computes the edit distance between two strings.

    Additions, deletions, and substitutions all count as a single operation.
    """
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)

    distances = range(len(s2) + 1)
    for i in range(len(s1)):
        new_distances = [i + 1]
        for j in range(len(s2)):
            cost = min(
                distances[j] + int(s1[i] != s2[j]),
                distances[j + 1] + 1,
                new_distances[-1] + 1,
            )
            new_distances.append(cost)
        distances = new_distances
    return distances[-1]
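

# Worked examples (illustrative only; the strings come from the broken foo.cpp
# test in the module docstring, not from a real test suite):
#   levenshtein("NEWW", "NEW") == 1  # one deletion
#   levenshtein("ODL", "OLD") == 2   # a swap costs two substitutions
# Both distances fall within the default threshold of 3, so both typos in that
# example would be flagged.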


class FileRange:
    """Stores the coordinates of a span on a single line within a file.

    Attributes:
      line: the line number
      start_column: the (inclusive) column where the span starts
      end_column: the (inclusive) column where the span ends
    """

    line: int
    start_column: int
    end_column: int

    def __init__(
        self, content: str, start_byte: int, end_byte: int
    ):  # pylint: disable=g-doc-args
        """Derives a span's coordinates based on a string and start/end bytes.

        `start_byte` and `end_byte` are assumed to be on the same line.
        """
        content_before_span = content[:start_byte]
        self.line = content_before_span.count("\n") + 1
        self.start_column = start_byte - content_before_span.rfind("\n")
        self.end_column = self.start_column + (end_byte - start_byte - 1)

    def __str__(self) -> str:
        return f"{self.line}:{self.start_column}-{self.end_column}"


class Diagnostic:
    """Stores information about one typo and a suggested fix.

    Attributes:
      filepath: the path to the file in which the typo was found
      filerange: the position at which the typo was found in the file
      typo: the typo
      fix: a suggested fix
    """

    filepath: pathlib.Path
    filerange: FileRange
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: FileRange,
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        return f"{self.filepath}:" + str(self.filerange) + f": {self.summary()}"

    def summary(self) -> str:
        return (
            f'Found potentially misspelled directive "{self.typo}". Did you mean '
            f'"{self.fix}"?'
        )


def find_potential_directives(
    content: str,
) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts all the potential FileCheck directives from a string.

    What constitutes a potential directive is loosely defined---we err on the side
    of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # A potential directive starts at the beginning of a line or right after a
    # comment marker ('//', ';', or '#'), possibly preceded by punctuation or
    # whitespace, and consists of a run of word characters, hyphens, and
    # whitespace that is immediately followed by a colon.
    directive_pattern = re.compile(
        r"(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):", re.MULTILINE
    )
    for match in re.finditer(directive_pattern, content):
        potential_directive, span = match.group(1), match.span(1)
        yield (FileRange(content, span[0], span[1]), potential_directive)


# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(
    content: str,
) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
    """Parses custom prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'

    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    param_re = r"|".join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
    for m in re.finditer(
        r"-check-prefix(?:es)?(?:\s+|=)({})".format(param_re), content
    ):
        prefixes = m.group(1)
        if prefixes.startswith("'") or prefixes.startswith('"'):
            prefixes = prefixes[1:-1]
        for prefix in prefixes.split(","):
            yield prefix
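

# For illustration (not executed), using the RUN lines from the docstring above:
#   list(parse_custom_prefixes(
#       "RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2"))
# returns ["CHECK1", "CHECK2"], and quoted arguments such as -check-prefix
# 'CHECK3' have their surrounding quotes stripped before being yielded.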


def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive is
        classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    all_directives = (
        [
            f"{prefix}{suffix}"
            for prefix, suffix in itertools.product(all_prefixes, _suffixes)
        ]
        + list(_ignore)
        + list(all_prefixes)
    )

    def find_best_match(typo):
        return min(
            [(threshold + 1, typo)]
            + [
                (levenshtein(typo, d), d)
                for d in all_directives
                if abs(len(d) - len(typo)) <= threshold
            ],
            key=lambda tup: tup[0],
        )

    potential_directives = find_potential_directives(content)

    for filerange, potential_directive in potential_directives:
        # TODO(bchetioui): match count directives more finely. We skip directives
        # starting with 'CHECK-COUNT-' for the moment as they require more complex
        # logic to be handled correctly.
        if any(
            potential_directive.startswith(f"{prefix}-COUNT-")
            for prefix in all_prefixes
        ):
            continue

        # Skip potential directives that are too long to be within `threshold` of
        # any actual directive; this avoids needlessly long edit distance
        # computations.
        if len(potential_directive) > max(map(len, all_directives)) + threshold:
            continue

        score, best_match = find_best_match(potential_directive)
        if score == 0:  # This is an actual directive, ignore.
            continue
        elif score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive, best_match)


def main(argv: Sequence[str]):
    if len(argv) < 2:
        print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n")
        sys.exit(1)

    for filepath in argv[1:]:
        logging.info("Checking %s", filepath)
        with open(filepath, "rt") as f:
            content = f.read()
        for diagnostic in find_directive_typos(
            content,
            pathlib.Path(filepath),
            threshold=_distance_threshold,
        ):
            print(diagnostic)


if __name__ == "__main__":
    main(sys.argv)