#!/usr/bin/env python3
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
"""A linter that detects potential typos in FileCheck directive names.

Consider a broken test foo.cpp:

// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
auto x = 42;
// NEWW: auto is a c++11 extension
// ODL-NOT: auto is a c++11 extension

We first detect the locally valid FileCheck directive prefixes by parsing the
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are
{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.

Then we look for lines that look like directives. These are of the form 'FOO:',
usually at the beginning of a line or a comment. If any of these are a
"near-miss" for a directive name, then we suspect this is a typo and report it.

Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
"""

import itertools
import logging
import pathlib
import re
import sys
from typing import Generator, Sequence, Tuple

_distance_threshold = 3
_prefixes = {"CHECK"}
_suffixes = {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_lit_directives = {
    "RUN",
    "REQUIRES",
    "UNSUPPORTED",
    "XFAIL",
    "DEFINE",
    "REDEFINE",
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {"COM", "RUN"}
_ignore = _lit_directives.union(_comment_prefixes).union({"NOTE", "TODO"})


def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
    """Computes the edit distance between two strings.

    Additions, deletions, and substitutions all count as a single operation.
    """
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)

    # Classic two-row dynamic programme: `distances[j]` is the edit distance
    # between the already-consumed prefix of s1 and s2[:j].
    distances = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        new_distances = [i + 1]
        for j, c2 in enumerate(s2):
            cost = min(
                distances[j] + int(c1 != c2),  # substitution (or match)
                distances[j + 1] + 1,  # deletion from s1
                new_distances[-1] + 1,  # insertion into s1
            )
            new_distances.append(cost)
        distances = new_distances
    return distances[-1]


class FileRange:
    """Stores the coordinates of a span on a single line within a file.

    Despite their names, `start_byte` and `end_byte` are indices into the
    `content` string (i.e. character offsets), as produced by
    `re.Match.span()`; the end offset is therefore exclusive.

    Attributes:
      content: the full text the offsets refer to
      start_byte: the (inclusive) offset in `content` where the span starts
      end_byte: the (exclusive) offset in `content` where the span ends
    """

    content: str
    start_byte: int
    end_byte: int

    def __init__(
        self, content: str, start_byte: int, end_byte: int
    ):  # pylint: disable=g-doc-args
        """
        Stores the coordinates of a span based on a string and start/end offsets.

        `start_byte` and `end_byte` are assumed to be on the same line.
        """
        self.content = content
        self.start_byte = start_byte
        self.end_byte = end_byte

    def as_str(self):
        """
        Derives the human-readable position "line:start_column-end_column".

        line: the 1-based line on which the span starts
        start_column: the (inclusive) 1-based column where the span starts
        end_column: the (inclusive) 1-based column where the span ends
        """
        content_before_span = self.content[: self.start_byte]
        line = content_before_span.count("\n") + 1
        # rfind() returns -1 when the span is on the first line, which makes
        # the subtraction yield a 1-based column in that case as well.
        start_column = self.start_byte - content_before_span.rfind("\n")
        end_column = start_column + (self.end_byte - self.start_byte - 1)

        return f"{line}:{start_column}-{end_column}"


class Diagnostic:
    """Stores information about one typo and a suggested fix.

    Attributes:
      filepath: the path to the file in which the typo was found
      filerange: the position at which the typo was found in the file
      typo: the typo
      fix: a suggested fix
    """

    filepath: pathlib.Path
    filerange: FileRange
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: FileRange,
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        return f"{self.filepath}:" + self.filerange.as_str() + f": {self.summary()}"

    def summary(self) -> str:
        return (
            f'Found potentially misspelled directive "{self.typo}". Did you mean '
            f'"{self.fix}"?'
        )


def find_potential_directives(
    content: str,
) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts all the potential FileCheck directives from a string.

    What constitutes a potential directive is loosely defined---we err on the side
    of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    directive_pattern = re.compile(
        r"(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):", re.MULTILINE
    )
    for match in directive_pattern.finditer(content):
        start, end = match.span(1)
        yield (FileRange(content, start, end), match.group(1))


# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(
    content: str,
) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
    """Parses custom prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'

    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    # A parameter is either single-quoted, double-quoted, or a bare word.
    param_re = r"|".join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
    for m in re.finditer(
        r"-check-prefix(?:es)?(?:\s+|=)({})".format(param_re), content
    ):
        prefixes = m.group(1)
        if prefixes.startswith("'") or prefixes.startswith('"'):
            prefixes = prefixes[1:-1]  # Strip the surrounding quotes.
        for prefix in prefixes.split(","):
            yield prefix


def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = 3,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive is
        classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    all_directives = (
        [
            f"{prefix}{suffix}"
            for prefix, suffix in itertools.product(all_prefixes, _suffixes)
        ]
        + list(_ignore)
        + list(all_prefixes)
    )

    # Loop-invariant values, computed once rather than per candidate.
    max_candidate_length = max(map(len, all_directives)) + threshold
    count_directive_starts = tuple(f"{prefix}-COUNT-" for prefix in all_prefixes)

    def find_best_match(typo):
        # The sentinel (threshold + 1, typo) guarantees a result even when no
        # directive is close enough in length to be worth comparing.
        return min(
            [(threshold + 1, typo)]
            + [
                (levenshtein(typo, d), d)
                for d in all_directives
                if abs(len(d) - len(typo)) <= threshold
            ],
            key=lambda tup: tup[0],
        )

    # Cache score and best_match to skip recalculating.
    score_and_best_match_for_potential_directive = {}
    for filerange, potential_directive in find_potential_directives(content):
        # TODO(bchetioui): match count directives more finely. We skip directives
        # starting with 'CHECK-COUNT-' for the moment as they require more complex
        # logic to be handled correctly.
        if potential_directive.startswith(count_directive_starts):
            continue

        # Ignoring potential typos that will not be matched later due to a too low
        # threshold, in order to avoid potentially long computation times.
        if len(potential_directive) > max_candidate_length:
            continue

        cached = score_and_best_match_for_potential_directive.get(
            potential_directive
        )
        if cached is None:
            cached = find_best_match(potential_directive)
            score_and_best_match_for_potential_directive[potential_directive] = cached
        score, best_match = cached

        if score == 0:  # This is an actual directive, ignore.
            continue
        if score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive, best_match)


def main(argv: Sequence[str]):
    """Lints every file named in `argv[1:]` and prints one line per diagnostic.

    Exits with status 1 after printing a usage message when no file is given.
    """
    if len(argv) < 2:
        print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n")
        # sys.exit() rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under `python -S`).
        sys.exit(1)

    for filepath in argv[1:]:
        logging.info("Checking %s", filepath)
        with open(filepath, "rt") as f:
            content = f.read()
        for diagnostic in find_directive_typos(
            content,
            pathlib.Path(filepath),
            threshold=_distance_threshold,
        ):
            print(diagnostic)


if __name__ == "__main__":
    main(sys.argv)