#!/usr/bin/env python3 # ===----------------------------------------------------------------------===## # # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # # ===----------------------------------------------------------------------===## """A linter that detects potential typos in FileCheck directive names. Consider a broken test foo.cpp: // RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW // RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD auto x = 42; // NEWW: auto is a c++11 extension // ODL-NOT: auto is a c++11 extension We first detect the locally valid FileCheck directive prefixes by parsing the --check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names are {CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}. Then we look for lines that look like directives. These are of the form 'FOO:', usually at the beginning of a line or a comment. If any of these are a "near-miss" for a directive name, then we suspect this is a typo and report it. Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n """ import itertools import logging import pathlib import re import sys from typing import Generator, Sequence, Tuple _distance_threshold = 3 _prefixes = {"CHECK"} _suffixes = {"-DAG", "-COUNT", "-EMPTY", "-LABEL", "-NEXT", "-NOT", "-SAME"} # 'NOTE' and 'TODO' are not directives, but are likely to be false positives # if encountered and to generate noise as a result. We filter them out also to # avoid this. _lit_directives = { "RUN", "REQUIRES", "UNSUPPORTED", "XFAIL", "DEFINE", "REDEFINE", } # 'COM' and 'RUN' are default comment prefixes for FileCheck. _comment_prefixes = {"COM", "RUN"} _ignore = _lit_directives.union(_comment_prefixes).union({"NOTE", "TODO"}) def levenshtein(s1: str, s2: str) -> int: # pylint: disable=g-doc-args """Computes the edit distance between two strings. Additions, deletions, and substitutions all count as a single operation. """ if not s1: return len(s2) if not s2: return len(s1) distances = range(len(s2) + 1) for i in range(len(s1)): new_distances = [i + 1] for j in range(len(s2)): cost = min( distances[j] + int(s1[i] != s2[j]), distances[j + 1] + 1, new_distances[-1] + 1, ) new_distances.append(cost) distances = new_distances return distances[-1] class FileRange: """Stores the coordinates of a span on a single line within a file. Attributes: content: line str start_byte: the (inclusive) byte offset the span starts end_byte: the (inclusive) byte offset the span ends """ content: str start_byte: int end_byte: int def __init__( self, content: str, start_byte: int, end_byte: int ): # pylint: disable=g-doc-args """ Stores the coordinates of a span based on a string and start/end bytes. `start_byte` and `end_byte` are assumed to be on the same line. """ self.content = content self.start_byte = start_byte self.end_byte = end_byte def as_str(self): """ Derives span from line and coordinates. start_column: the (inclusive) column where the span starts end_column: the (inclusive) column where the span ends """ content_before_span = self.content[: self.start_byte] line = content_before_span.count("\n") + 1 start_column = self.start_byte - content_before_span.rfind("\n") end_column = start_column + (self.end_byte - self.start_byte - 1) return f"{line}:{start_column}-{end_column}" class Diagnostic: """Stores information about one typo and a suggested fix. Attributes: filepath: the path to the file in which the typo was found filerange: the position at which the typo was found in the file typo: the typo fix: a suggested fix """ filepath: pathlib.Path filerange: FileRange typo: str fix: str def __init__( self, filepath: pathlib.Path, filerange: FileRange, typo: str, fix: str, # pylint: disable=redefined-outer-name ): self.filepath = filepath self.filerange = filerange self.typo = typo self.fix = fix def __str__(self) -> str: return f"{self.filepath}:" + self.filerange.as_str() + f": {self.summary()}" def summary(self) -> str: return ( f'Found potentially misspelled directive "{self.typo}". Did you mean ' f'"{self.fix}"?' ) def find_potential_directives( content: str, ) -> Generator[Tuple[FileRange, str], None, None]: """Extracts all the potential FileCheck directives from a string. What constitutes a potential directive is loosely defined---we err on the side of capturing more strings than is necessary, rather than missing any. Args: content: the string in which to look for directives Yields: Tuples (p, d) where p is the span where the potential directive occurs within the string and d is the potential directive. """ directive_pattern = re.compile( r"(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):", re.MULTILINE ) for match in re.finditer(directive_pattern, content): potential_directive, span = match.group(1), match.span(1) yield (FileRange(content, span[0], span[1]), potential_directive) # TODO(bchetioui): also parse comment prefixes to ignore. def parse_custom_prefixes( content: str, ) -> Generator[str, None, None]: # pylint: disable=g-doc-args """Parses custom prefixes defined in the string provided. For example, given the following file content: RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2 RUN: something_else | FileCheck %s -check-prefix 'CHECK3' the custom prefixes are CHECK1, CHECK2, and CHECK3. """ param_re = r"|".join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+']) for m in re.finditer( r"-check-prefix(?:es)?(?:\s+|=)({})".format(param_re), content ): prefixes = m.group(1) if prefixes.startswith("'") or prefixes.startswith('"'): prefixes = prefixes[1:-1] for prefix in prefixes.split(","): yield prefix def find_directive_typos( content: str, filepath: pathlib.Path, threshold: int = 3, ) -> Generator[Diagnostic, None, None]: """Detects potential typos in FileCheck directives. Args: content: the content of the file filepath: the path to the file to check for typos in directives threshold: the (inclusive) maximum edit distance between a potential directive and an actual directive, such that the potential directive is classified as a typo Yields: Diagnostics, in order from the top of the file. """ all_prefixes = _prefixes.union(set(parse_custom_prefixes(content))) all_directives = ( [ f"{prefix}{suffix}" for prefix, suffix in itertools.product(all_prefixes, _suffixes) ] + list(_ignore) + list(all_prefixes) ) def find_best_match(typo): return min( [(threshold + 1, typo)] + [ (levenshtein(typo, d), d) for d in all_directives if abs(len(d) - len(typo)) <= threshold ], key=lambda tup: tup[0], ) potential_directives = find_potential_directives(content) # Cache score and best_match to skip recalculating. score_and_best_match_for_potential_directive = dict() for filerange, potential_directive in potential_directives: # TODO(bchetioui): match count directives more finely. We skip directives # starting with 'CHECK-COUNT-' for the moment as they require more complex # logic to be handled correctly. if any( potential_directive.startswith(f"{prefix}-COUNT-") for prefix in all_prefixes ): continue # Ignoring potential typos that will not be matched later due to a too low # threshold, in order to avoid potentially long computation times. if len(potential_directive) > max(map(len, all_directives)) + threshold: continue if potential_directive not in score_and_best_match_for_potential_directive: score, best_match = find_best_match(potential_directive) score_and_best_match_for_potential_directive[potential_directive] = ( score, best_match, ) else: score, best_match = score_and_best_match_for_potential_directive[ potential_directive ] if score == 0: # This is an actual directive, ignore. continue elif score <= threshold and best_match not in _ignore: yield Diagnostic(filepath, filerange, potential_directive, best_match) def main(argv: Sequence[str]): if len(argv) < 2: print(f"Usage: {argv[0]} path/to/file/1 ... path/to/file/n") exit(1) for filepath in argv[1:]: logging.info("Checking %s", filepath) with open(filepath, "rt") as f: content = f.read() for diagnostic in find_directive_typos( content, pathlib.Path(filepath), threshold=_distance_threshold, ): print(diagnostic) if __name__ == "__main__": main(sys.argv)