#!/usr/bin/env python3
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
"""A linter that detects potential typos in FileCheck directive names.

Consider a broken test foo.cpp:

// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
auto x = 42;
// NEWW: auto is a c++11 extension
// ODL-NOT: auto is a c++11 extension

We first detect the locally valid FileCheck directive prefixes by parsing the
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.

Then we look for lines that look like directives. These are of the form 'FOO:',
usually at the beginning of a line or a comment. If any of these are a
"near-miss" for a directive name, then we suspect this is a typo and report it.

Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
"""

import itertools
import logging
import pathlib
import re
import sys
from typing import Generator, Sequence, Tuple

_distance_threshold = 3
_prefixes = {"CHECK"}
_suffixes = {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_lit_directives = {
    "RUN",
    "REQUIRES",
    "UNSUPPORTED",
    "XFAIL",
    "DEFINE",
    "REDEFINE",
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {"COM", "RUN"}
_ignore = _lit_directives.union(_comment_prefixes).union({"NOTE", "TODO"})


def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
    """Computes the edit distance between two strings.

    Additions, deletions, and substitutions all count as a single operation.
    """
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)

    # Classic single-row dynamic program: `distances[j]` holds the edit
    # distance between the prefix s1[:i] and s2[:j] from the previous row.
    distances = range(len(s2) + 1)
    for i in range(len(s1)):
        new_distances = [i + 1]
        for j in range(len(s2)):
            cost = min(
                distances[j] + int(s1[i] != s2[j]),  # substitution (free on match)
                distances[j + 1] + 1,  # deletion from s1
                new_distances[-1] + 1,  # insertion into s1
            )
            new_distances.append(cost)
        distances = new_distances
    return distances[-1]


class FileRange:
    """Stores the coordinates of a span on a single line within a file.

    Attributes:
      content: the full text the byte offsets index into
      start_byte: the (inclusive) byte offset the span starts
      end_byte: the (inclusive) byte offset the span ends
    """

    content: str
    start_byte: int
    end_byte: int

    def __init__(
        self, content: str, start_byte: int, end_byte: int
    ):  # pylint: disable=g-doc-args
        """
        Stores the coordinates of a span based on a string and start/end bytes.

        `start_byte` and `end_byte` are assumed to be on the same line.
        """
        self.content = content
        self.start_byte = start_byte
        self.end_byte = end_byte

    def as_str(self):
        """
        Derives a 'line:start_column-end_column' string from the coordinates.

        start_column: the (inclusive) column where the span starts
        end_column: the (inclusive) column where the span ends
        """
        content_before_span = self.content[: self.start_byte]
        # Line and column numbers are 1-based; `rfind` returns -1 when the
        # span is on the first line, which makes the arithmetic below work
        # out for that case too.
        line = content_before_span.count("\n") + 1
        start_column = self.start_byte - content_before_span.rfind("\n")
        end_column = start_column + (self.end_byte - self.start_byte - 1)

        return f"{line}:{start_column}-{end_column}"


class Diagnostic:
    """Stores information about one typo and a suggested fix.

    Attributes:
      filepath: the path to the file in which the typo was found
      filerange: the position at which the typo was found in the file
      typo: the typo
      fix: a suggested fix
    """

    filepath: pathlib.Path
    filerange: FileRange
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: FileRange,
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        return f"{self.filepath}:" + self.filerange.as_str() + f": {self.summary()}"

    def summary(self) -> str:
        return (
            f'Found potentially misspelled directive "{self.typo}". Did you mean '
            f'"{self.fix}"?'
        )


def find_potential_directives(
    content: str,
) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts all the potential FileCheck directives from a string.

    What constitutes a potential directive is loosely defined---we err on the side
    of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # `\w` already covers `\d` and `_`, so `[\w-]` is equivalent to the
    # long-hand `[\d\w\-_]` class. A candidate is a word-ish token followed
    # by ':', at the start of a line or after a comment introducer.
    directive_pattern = re.compile(
        r"(?:^|//|;|#)[^\w-]*([\w-][\s\w-]*):", re.MULTILINE
    )
    for match in directive_pattern.finditer(content):
        potential_directive, span = match.group(1), match.span(1)
        yield (FileRange(content, span[0], span[1]), potential_directive)


# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(
    content: str,
) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
    """Parses custom prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'

    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    # The flag value is either single-quoted, double-quoted, or a bare token.
    param_re = r"|".join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
    for m in re.finditer(
        r"-check-prefix(?:es)?(?:\s+|=)({})".format(param_re), content
    ):
        prefixes = m.group(1)
        if prefixes.startswith("'") or prefixes.startswith('"'):
            prefixes = prefixes[1:-1]
        for prefix in prefixes.split(","):
            yield prefix


def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive is
        classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    all_directives = (
        [
            f"{prefix}{suffix}"
            for prefix, suffix in itertools.product(all_prefixes, _suffixes)
        ]
        + list(_ignore)
        + list(all_prefixes)
    )
    # Loop-invariant: longest valid directive name, used to prune candidates
    # that are too long to ever be within `threshold` of a real directive.
    max_directive_len = max(map(len, all_directives))

    def find_best_match(typo):
        # Seed with (threshold + 1, typo) so that a candidate with no
        # sufficiently close directive maps to itself with an over-threshold
        # score, which the caller then discards.
        return min(
            [(threshold + 1, typo)]
            + [
                (levenshtein(typo, d), d)
                for d in all_directives
                if abs(len(d) - len(typo)) <= threshold
            ],
            key=lambda tup: tup[0],
        )

    potential_directives = find_potential_directives(content)
    # Cache score and best_match to skip recalculating.
    cached_matches = {}
    for filerange, potential_directive in potential_directives:
        # TODO(bchetioui): match count directives more finely. We skip directives
        # starting with 'CHECK-COUNT-' for the moment as they require more complex
        # logic to be handled correctly.
        if any(
            potential_directive.startswith(f"{prefix}-COUNT-")
            for prefix in all_prefixes
        ):
            continue

        # Ignoring potential typos that will not be matched later due to a too low
        # threshold, in order to avoid potentially long computation times.
        if len(potential_directive) > max_directive_len + threshold:
            continue

        cached = cached_matches.get(potential_directive)
        if cached is None:
            cached = find_best_match(potential_directive)
            cached_matches[potential_directive] = cached
        score, best_match = cached

        if score == 0:  # This is an actual directive, ignore.
            continue
        elif score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive, best_match)


def main(argv: Sequence[str]):
    """Lints each file given on the command line, printing diagnostics."""
    if len(argv) < 2:
        print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n")
        # `sys.exit` rather than the `site`-provided `exit` builtin, which is
        # not guaranteed to be available (e.g. under `python -S`).
        sys.exit(1)

    for filepath in argv[1:]:
        logging.info("Checking %s", filepath)
        # LLVM test files are UTF-8; be explicit rather than relying on the
        # platform's locale encoding.
        with open(filepath, "rt", encoding="utf-8") as f:
            content = f.read()
        for diagnostic in find_directive_typos(
            content,
            pathlib.Path(filepath),
            threshold=_distance_threshold,
        ):
            print(diagnostic)


if __name__ == "__main__":
    main(sys.argv)