1# ===----------------------------------------------------------------------===## 2# 3# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4# See https://llvm.org/LICENSE.txt for license information. 5# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6# 7# ===----------------------------------------------------------------------===## 8"""A linter that detects potential typos in FileCheck directive names. 9 10Consider a broken test foo.cpp: 11 12// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW 13// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD 14auto x = 42; 15// NEWW: auto is a c++11 extension 16// ODL-NOT: auto is a c++11 extension 17 18We first detect the locally valid FileCheck directive prefixes by parsing the 19--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are 20{CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}. 21 22Then we look for lines that look like directives. These are of the form 'FOO:', 23usually at the beginning of a line or a comment. If any of these are a 24"near-miss" for a directive name, then we suspect this is a typo and report it. 25 26Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n 27""" 28 29import itertools 30import logging 31import pathlib 32import re 33import sys 34from typing import Generator, Sequence, Tuple 35 36_distance_threshold = 3 37_prefixes = {'CHECK'} 38_suffixes = {'-DAG', '-COUNT', '-EMPTY', '-LABEL', '-NEXT', '-NOT', '-SAME'} 39# 'NOTE' and 'TODO' are not directives, but are likely to be false positives 40# if encountered and to generate noise as a result. We filter them out also to 41# avoid this. 42_lit_directives = { 43 'RUN', 44 'REQUIRES', 45 'UNSUPPORTED', 46 'XFAIL', 47 'DEFINE', 48 'REDEFINE', 49} 50# 'COM' and 'RUN' are default comment prefixes for FileCheck. 51_comment_prefixes = {'COM', 'RUN'} 52_ignore = _lit_directives.union(_comment_prefixes).union({'NOTE', 'TODO'}) 53 54 55def levenshtein(s1: str, s2: str) -> int: # pylint: disable=g-doc-args 56 """Computes the edit distance between two strings. 57 58 Additions, deletions, and substitutions all count as a single operation. 59 """ 60 if not s1: 61 return len(s2) 62 if not s2: 63 return len(s1) 64 65 distances = range(len(s2) + 1) 66 for i in range(len(s1)): 67 new_distances = [i + 1] 68 for j in range(len(s2)): 69 cost = min(distances[j] + int(s1[i] != s2[j]), distances[j + 1] + 1, 70 new_distances[-1] + 1) 71 new_distances.append(cost) 72 distances = new_distances 73 return distances[-1] 74 75 76class FileRange: 77 """Stores the coordinates of a span on a single line within a file. 78 79 Attributes: 80 line: the line number 81 start_column: the (inclusive) column where the span starts 82 end_column: the (inclusive) column where the span ends 83 """ 84 line: int 85 start_column: int 86 end_column: int 87 88 def __init__(self, content: str, start_byte: int, end_byte: int): # pylint: disable=g-doc-args 89 """Derives a span's coordinates based on a string and start/end bytes. 90 91 `start_byte` and `end_byte` are assumed to be on the same line. 92 """ 93 content_before_span = content[:start_byte] 94 self.line = content_before_span.count('\n') + 1 95 self.start_column = start_byte - content_before_span.rfind('\n') 96 self.end_column = self.start_column + (end_byte - start_byte - 1) 97 98 def __str__(self) -> str: 99 return f'{self.line}:{self.start_column}-{self.end_column}' 100 101 102class Diagnostic: 103 """Stores information about one typo and a suggested fix. 104 105 Attributes: 106 filepath: the path to the file in which the typo was found 107 filerange: the position at which the typo was found in the file 108 typo: the typo 109 fix: a suggested fix 110 """ 111 112 filepath: pathlib.Path 113 filerange: FileRange 114 typo: str 115 fix: str 116 117 def __init__( 118 self, 119 filepath: pathlib.Path, 120 filerange: FileRange, 121 typo: str, 122 fix: str # pylint: disable=redefined-outer-name 123 ): 124 self.filepath = filepath 125 self.filerange = filerange 126 self.typo = typo 127 self.fix = fix 128 129 def __str__(self) -> str: 130 return f'{self.filepath}:' + str(self.filerange) + f': {self.summary()}' 131 132 def summary(self) -> str: 133 return ( 134 f'Found potentially misspelled directive "{self.typo}". Did you mean ' 135 f'"{self.fix}"?') 136 137 138def find_potential_directives( 139 content: str,) -> Generator[Tuple[FileRange, str], None, None]: 140 """Extracts all the potential FileCheck directives from a string. 141 142 What constitutes a potential directive is loosely defined---we err on the side 143 of capturing more strings than is necessary, rather than missing any. 144 145 Args: 146 content: the string in which to look for directives 147 148 Yields: 149 Tuples (p, d) where p is the span where the potential directive occurs 150 within the string and d is the potential directive. 151 """ 152 directive_pattern = re.compile( 153 r'(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):', re.MULTILINE) 154 for match in re.finditer(directive_pattern, content): 155 potential_directive, span = match.group(1), match.span(1) 156 yield (FileRange(content, span[0], span[1]), potential_directive) 157 158 159# TODO(bchetioui): also parse comment prefixes to ignore. 160def parse_custom_prefixes(content: str) -> Generator[str, None, None]: # pylint: disable=g-doc-args 161 """Parses custom prefixes defined in the string provided. 162 163 For example, given the following file content: 164 RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2 165 RUN: something_else | FileCheck %s -check-prefix 'CHECK3' 166 167 the custom prefixes are CHECK1, CHECK2, and CHECK3. 168 """ 169 param_re = r'|'.join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+']) 170 for m in re.finditer(r'-check-prefix(?:es)?(?:\s+|=)({})'.format(param_re), 171 content): 172 prefixes = m.group(1) 173 if prefixes.startswith('\'') or prefixes.startswith('"'): 174 prefixes = prefixes[1:-1] 175 for prefix in prefixes.split(','): 176 yield prefix 177 178 179def find_directive_typos( 180 content: str, 181 filepath: pathlib.Path, 182 threshold: int = 3, 183) -> Generator[Diagnostic, None, None]: 184 """Detects potential typos in FileCheck directives. 185 186 Args: 187 content: the content of the file 188 filepath: the path to the file to check for typos in directives 189 threshold: the (inclusive) maximum edit distance between a potential 190 directive and an actual directive, such that the potential directive is 191 classified as a typo 192 193 Yields: 194 Diagnostics, in order from the top of the file. 195 """ 196 all_prefixes = _prefixes.union(set(parse_custom_prefixes(content))) 197 all_directives = ([ 198 f'{prefix}{suffix}' 199 for prefix, suffix in itertools.product(all_prefixes, _suffixes) 200 ] + list(_ignore) + list(all_prefixes)) 201 202 def find_best_match(typo): 203 return min( 204 [(threshold + 1, typo)] + [(levenshtein(typo, d), d) 205 for d in all_directives 206 if abs(len(d) - len(typo)) <= threshold], 207 key=lambda tup: tup[0], 208 ) 209 210 potential_directives = find_potential_directives(content) 211 212 for filerange, potential_directive in potential_directives: 213 # TODO(bchetioui): match count directives more finely. We skip directives 214 # starting with 'CHECK-COUNT-' for the moment as they require more complex 215 # logic to be handled correctly. 216 if any( 217 potential_directive.startswith(f'{prefix}-COUNT-') 218 for prefix in all_prefixes): 219 continue 220 221 # Ignoring potential typos that will not be matched later due to a too low 222 # threshold, in order to avoid potentially long computation times. 223 if len(potential_directive) > max(map(len, all_directives)) + threshold: 224 continue 225 226 score, best_match = find_best_match(potential_directive) 227 if score == 0: # This is an actual directive, ignore. 228 continue 229 elif score <= threshold and best_match not in _ignore: 230 yield Diagnostic(filepath, filerange, potential_directive, best_match) 231 232 233def main(argv: Sequence[str]): 234 if len(argv) < 2: 235 print(f'Usage: {argv[0]} path/to/file/1 ... path/to/file/n') 236 exit(1) 237 238 for filepath in argv[1:]: 239 logging.info('Checking %s', filepath) 240 with open(filepath, 'rt') as f: 241 content = f.read() 242 for diagnostic in find_directive_typos( 243 content, 244 pathlib.Path(filepath), 245 threshold=_distance_threshold, 246 ): 247 print(diagnostic) 248 249 250if __name__ == '__main__': 251 main(sys.argv) 252