#!/usr/bin/env python3
# A tool to parse ASTMatchers.h and update the documentation in
# ../LibASTMatchersReference.html automatically. Run from the
# directory in which this file is located to update the docs.

import collections
import contextlib
import re
import os

try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen

CLASS_INDEX_PAGE_URL = "https://clang.llvm.org/doxygen/classes.html"
# Upper bound, in seconds, on how long we wait for the class index page. The
# page is only used to decide which node names get a doxygen link, so an
# unresponsive server must not hang the documentation update.
CLASS_INDEX_FETCH_TIMEOUT = 30

# CLASS_INDEX_PAGE holds the decoded class index page, or None when it could
# not be downloaded. This is deliberately best-effort: any failure is reported
# and the tool carries on without the page.
try:
    # contextlib.closing() guarantees the response is closed, including on the
    # urllib2 fallback whose response object is not a context manager.
    with contextlib.closing(
        urlopen(CLASS_INDEX_PAGE_URL, timeout=CLASS_INDEX_FETCH_TIMEOUT)
    ) as response:
        CLASS_INDEX_PAGE = response.read().decode("utf-8")
except Exception as e:
    CLASS_INDEX_PAGE = None
    print("Unable to get %s: %s" % (CLASS_INDEX_PAGE_URL, e))

CURRENT_DIR = os.path.dirname(__file__)
MATCHERS_FILE = os.path.join(
    CURRENT_DIR, "../../include/clang/ASTMatchers/ASTMatchers.h"
)
HTML_FILE = os.path.join(CURRENT_DIR, "../LibASTMatchersReference.html")

# Each matcher is documented in one row of the form:
#   result | name | argA
# The subsequent row contains the documentation and is hidden by default,
# becoming visible via javascript when the user clicks the matcher name.
TD_TEMPLATE = """
<tr><td>%(result)s</td><td class="name" onclick="toggle('%(id)s')"><a name="%(id)sAnchor">%(name)s</a></td><td>%(args)s</td></tr>
<tr><td colspan="4" class="doc" id="%(id)s"><pre>%(comment)s</pre></td></tr>
"""

# We categorize the matchers into these three categories in the reference:
node_matchers = {}
narrowing_matchers = {}
traversal_matchers = {}

# We output multiple rows per matcher if the matcher can be used on multiple
# node types. Thus, we need a new id per row to control the documentation
# pop-up. ids[name] keeps track of those ids.
ids = collections.defaultdict(int)

# Cache for doxygen urls we have already verified.
doxygen_probes = {}

# Matchers that take other matchers as arguments but still narrow the set of
# matched nodes instead of traversing to other nodes.
_NARROWING_MATCHERS_WITH_MATCHER_ARGS = (
    "allOf",
    "anyOf",
    "anything",
    "unless",
    "mapAnyOf",
)

# Note appended to the documentation of every regex based matcher.
_REGEX_FLAGS_NOTE = """
If the matcher is used in clang-query, RegexFlags parameter
should be passed as a quoted string. e.g: "NoFlags".
Flags can be combined with '|' example "IgnoreCase | BasicRegex"
"""


def esc(text):
    """Escape any html in the given text.

    '&', '<' and '>' are replaced by their HTML entities. Afterwards every
    escaped 'Matcher<Name>' whose 'Name' is listed on the doxygen class index
    page is turned into a link to the doxygen page of that class.

    Returns the escaped (and possibly linked) text.
    """
    # '&' has to be escaped first; otherwise the entities generated for '<'
    # and '>' would be escaped a second time.
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")

    def link_if_exists(m):
        """Wrap a likely AST node name in a link to its clang docs.

        We want to do this only if the page exists, in which case it will be
        referenced from the class index page.
        """
        name = m.group(1)
        url = "https://clang.llvm.org/doxygen/classclang_1_1%s.html" % name
        if url not in doxygen_probes:
            search_str = 'href="classclang_1_1%s.html"' % name
            if CLASS_INDEX_PAGE is not None:
                doxygen_probes[url] = search_str in CLASS_INDEX_PAGE
            else:
                # Without the index page we cannot verify anything, so we
                # optimistically link every candidate.
                doxygen_probes[url] = True
            if not doxygen_probes[url]:
                print("Did not find %s in class index page" % name)
        if doxygen_probes[url]:
            return 'Matcher&lt;<a href="%s">%s</a>&gt;' % (url, name)
        return m.group(0)

    # The text is already escaped at this point, so the angle brackets around
    # the node name show up as entities. Excluding '&' from the name keeps the
    # match from running across another entity.
    text = re.sub(r"Matcher&lt;([^\*&]+)&gt;", link_if_exists, text)
    return text


def extract_result_types(comment):
    """Extracts a list of result types from the given comment.

    We allow annotations in the comment of the matcher to specify what
    nodes a matcher can match on. Those comments have the form:
      Usable as: Any Matcher | (Matcher<T1>[, Matcher<t2>[, ...]])

    Returns ['*'] in case of 'Any Matcher', or ['T1', 'T2', ...].
    The types are collected from the end of the comment towards the
    'Usable as:' marker, so they are listed in reverse order of appearance.
    Returns the empty list if no 'Usable as' specification could be
    parsed.
    """
    if re.search(r"Usable as: Any Matcher[\s\n]*$", comment, re.S):
        return ["*"]
    result_types = []
    while True:
        # Peel the last 'Matcher<T>' off the end of the remaining comment.
        m = re.match(r"^(.*)Matcher<([^>]+)>\s*,?[\s\n]*$", comment, re.S)
        if not m:
            if re.search(r"Usable as:\s*$", comment):
                return result_types
            # The trailing matcher list is not introduced by 'Usable as:'.
            return []
        result_types.append(m.group(2))
        comment = m.group(1)


def strip_doxygen(comment):
    r"""Returns the given comment without \-escaped words."""
    # If there is only a doxygen keyword in the line, delete the whole line.
    comment = re.sub(r"^\\[^\s]+\n", r"", comment, flags=re.M)

    # If there is a doxygen \see command, change the \see prefix into "See also:".
    # FIXME: it would be better to turn this into a link to the target instead.
    comment = re.sub(r"\\see", r"See also:", comment)

    # Delete the doxygen command and the following whitespace.
    comment = re.sub(r"\\[^\s]+\s+", r"", comment)
    return comment


def unify_arguments(args):
    """Gets rid of anything the user doesn't care about in the argument list."""
    args = re.sub(r"clang::ast_matchers::internal::", r"", args)
    args = re.sub(r"ast_matchers::internal::", r"", args)
    args = re.sub(r"internal::", r"", args)
    args = re.sub(r"extern const\s+(.*)&", r"\1 ", args)
    args = re.sub(r"&", r" ", args)
    # A bare template parameter 'M' (optionally followed by one digit) is
    # rendered as a matcher on any node.
    args = re.sub(r"(^|\s)M\d?(\s)", r"\1Matcher<*>\2", args)
    args = re.sub(r"BindableMatcher", r"Matcher", args)
    args = re.sub(r"const Matcher", r"Matcher", args)
    return args


def unify_type(result_type):
    """Gets rid of anything the user doesn't care about in the type name."""
    result_type = re.sub(
        r"^internal::(Bindable)?Matcher<([a-zA-Z_][a-zA-Z0-9_]*)>$", r"\2", result_type
    )
    return result_type


def add_matcher(result_type, name, args, comment, is_dyncast=False):
    """Adds a matcher to one of our categories.

    Renders one documentation row for the matcher and stores it in
    node_matchers (if 'is_dyncast' is set), narrowing_matchers or
    traversal_matchers. When a row with the same lookup key already exists,
    the longer of the two rows is kept.
    """
    if name == "id":
        # FIXME: Figure out whether we want to support the 'id' matcher.
        return
    matcher_id = "%s%d" % (name, ids[name])
    ids[name] += 1
    args = unify_arguments(args)
    result_type = unify_type(result_type)

    docs_result_type = esc("Matcher<%s>" % result_type)

    if name == "mapAnyOf":
        args = "nodeMatcherFunction..."
        # Intentionally raw html; this string is not passed through esc().
        docs_result_type = "<em>unspecified</em>"

    matcher_html = TD_TEMPLATE % {
        "result": docs_result_type,
        "name": name,
        "args": esc(args),
        "comment": esc(strip_doxygen(comment)),
        "id": matcher_id,
    }
    if is_dyncast:
        table = node_matchers
        lookup = result_type + name
    # Use a heuristic to figure out whether a matcher is a narrowing or
    # traversal matcher. By default, matchers that take other matchers as
    # arguments (and are not node matchers) do traversal. We specifically
    # exclude known narrowing matchers that also take other matchers as
    # arguments.
    elif "Matcher<" not in args or name in _NARROWING_MATCHERS_WITH_MATCHER_ARGS:
        table = narrowing_matchers
        lookup = result_type + name + esc(args)
    else:
        table = traversal_matchers
        lookup = result_type + name + esc(args)

    existing = table.get(lookup)
    if existing is None or len(existing) < len(matcher_html):
        table[lookup] = matcher_html


def _format_macro_args(groups):
    """Joins consecutive (type, name) regex captures into 'type name, ...'."""
    return ", ".join(
        "%s %s" % (groups[i], groups[i + 1])
        for i in range(0, len(groups), 2)
        if groups[i]
    )


def act_on_decl(declaration, comment, allowed_types):
    """Parse the matcher out of the given declaration and comment.

    If 'allowed_types' is set, it contains a list of node types the matcher
    can match on, as extracted from the static type asserts in the matcher
    definition.

    The declaration is tried against the known matcher declaration forms in
    a fixed order; the first form that matches registers the matcher via
    add_matcher(). Declarations that match none of them are reported on
    stdout. Blank declarations are ignored.

    Raises Exception if the documented result types disagree with the
    declared ones, if a macro has an unsupported parameter count suffix, or
    if no result type can be found for an AST_MATCHER.
    """
    if not declaration.strip():
        return

    if re.match(
        r"^\s?(#|namespace|using|template <typename NodeType> using|})", declaration
    ):
        return

    # Node matchers are defined by writing:
    #   VariadicDynCastAllOfMatcher<ResultType, ArgumentType> name;
    m = re.match(
        r""".*Variadic(?:DynCast)?AllOfMatcher\s*<
            \s*([^\s,]+)\s*(?:,
            \s*([^\s>]+)\s*)?>
            \s*([^\s;]+)\s*;\s*$""",
        declaration,
        flags=re.X,
    )
    if m:
        result, inner, name = m.groups()
        if not inner:
            inner = result
        add_matcher(result, name, "Matcher<%s>..." % inner, comment, is_dyncast=True)
        return

    # Special case of type matchers:
    #   AstTypeMatcher<ArgumentType> name
    m = re.match(
        r""".*AstTypeMatcher\s*<
            \s*([^\s>]+)\s*>
            \s*([^\s;]+)\s*;\s*$""",
        declaration,
        flags=re.X,
    )
    if m:
        inner, name = m.groups()
        add_matcher("Type", name, "Matcher<%s>..." % inner, comment, is_dyncast=True)
        # FIXME: re-enable once we have implemented casting on the TypeLoc
        # hierarchy.
        # add_matcher('TypeLoc', '%sLoc' % name, 'Matcher<%sLoc>...' % inner,
        #             comment, is_dyncast=True)
        return

    # Parse the various matcher definition macros.
    m = re.match(
        r""".*AST_TYPE(LOC)?_TRAVERSE_MATCHER(?:_DECL)?\(
            \s*([^\s,]+\s*),
            \s*(?:[^\s,]+\s*),
            \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\)
            \)\s*;\s*$""",
        declaration,
        flags=re.X,
    )
    if m:
        _loc, name, results = m.groups()
        result_types = [r.strip() for r in results.split(",")]

        comment_result_types = extract_result_types(comment)
        if comment_result_types and sorted(result_types) != sorted(
            comment_result_types
        ):
            raise Exception("Inconsistent documentation for: %s" % name)
        for result_type in result_types:
            add_matcher(result_type, name, "Matcher<Type>", comment)
            # if loc:
            #   add_matcher('%sLoc' % result_type, '%sLoc' % name, 'Matcher<TypeLoc>',
            #               comment)
        return

    m = re.match(
        r"""^\s*AST_POLYMORPHIC_MATCHER(_P)?(.?)(?:_OVERLOAD)?\(
            \s*([^\s,]+)\s*,
            \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\)
            (?:,\s*([^\s,]+)\s*
               ,\s*([^\s,]+)\s*)?
            (?:,\s*([^\s,]+)\s*
               ,\s*([^\s,]+)\s*)?
            (?:,\s*\d+\s*)?
            \)\s*{\s*$""",
        declaration,
        flags=re.X,
    )
    if m:
        _p, n, name, results = m.groups()[0:4]
        args = m.groups()[4:]
        result_types = [r.strip() for r in results.split(",")]
        if allowed_types and allowed_types != result_types:
            raise Exception("Inconsistent documentation for: %s" % name)
        # 'n' is the character following the optional '_P' suffix; only the
        # plain and the two-parameter ('_P2') forms are understood.
        if n not in ["", "2"]:
            raise Exception('Cannot parse "%s"' % declaration)
        args = _format_macro_args(args)
        for result_type in result_types:
            add_matcher(result_type, name, args, comment)
        return

    m = re.match(
        r"""^\s*AST_POLYMORPHIC_MATCHER_REGEX(?:_OVERLOAD)?\(
            \s*([^\s,]+)\s*,
            \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\),
            \s*([^\s,]+)\s*
            (?:,\s*\d+\s*)?
            \)\s*{\s*$""",
        declaration,
        flags=re.X,
    )
    if m:
        name, results, arg_name = m.groups()
        result_types = [r.strip() for r in results.split(",")]
        if allowed_types and allowed_types != result_types:
            raise Exception("Inconsistent documentation for: %s" % name)
        arg = "StringRef %s, Regex::RegexFlags Flags = NoFlags" % arg_name
        comment += _REGEX_FLAGS_NOTE
        for result_type in result_types:
            add_matcher(result_type, name, arg, comment)
        return

    m = re.match(
        r"""^\s*AST_MATCHER_FUNCTION(_P)?(.?)(?:_OVERLOAD)?\(
            (?:\s*([^\s,]+)\s*,)?
            \s*([^\s,]+)\s*
            (?:,\s*([^\s,]+)\s*
               ,\s*([^\s,]+)\s*)?
            (?:,\s*([^\s,]+)\s*
               ,\s*([^\s,]+)\s*)?
            (?:,\s*\d+\s*)?
            \)\s*{\s*$""",
        declaration,
        flags=re.X,
    )
    if m:
        _p, n, result, name = m.groups()[0:4]
        args = m.groups()[4:]
        if n not in ["", "2"]:
            raise Exception('Cannot parse "%s"' % declaration)
        args = _format_macro_args(args)
        add_matcher(result, name, args, comment)
        return

    m = re.match(
        r"""^\s*AST_MATCHER(_P)?(.?)(?:_OVERLOAD)?\(
            (?:\s*([^\s,]+)\s*,)?
            \s*([^\s,]+)\s*
            (?:,\s*([^,]+)\s*
               ,\s*([^\s,]+)\s*)?
            (?:,\s*([^\s,]+)\s*
               ,\s*([^\s,]+)\s*)?
            (?:,\s*\d+\s*)?
            \)\s*{""",
        declaration,
        flags=re.X,
    )
    if m:
        _p, n, result, name = m.groups()[0:4]
        args = m.groups()[4:]
        if not result:
            # No explicit node type in the macro: fall back to the types
            # collected from the static asserts in the matcher body.
            if not allowed_types:
                raise Exception("Did not find allowed result types for: %s" % name)
            result_types = allowed_types
        else:
            result_types = [result]
        if n not in ["", "2"]:
            raise Exception('Cannot parse "%s"' % declaration)
        args = _format_macro_args(args)
        for result_type in result_types:
            add_matcher(result_type, name, args, comment)
        return

    m = re.match(
        r"""^\s*AST_MATCHER_REGEX(?:_OVERLOAD)?\(
            \s*([^\s,]+)\s*,
            \s*([^\s,]+)\s*,
            \s*([^\s,]+)\s*
            (?:,\s*\d+\s*)?
            \)\s*{""",
        declaration,
        flags=re.X,
    )
    if m:
        # All three groups are mandatory in the pattern, so the node type is
        # always present here.
        result, name, arg_name = m.groups()
        arg = "StringRef %s, Regex::RegexFlags Flags = NoFlags" % arg_name
        comment += _REGEX_FLAGS_NOTE
        add_matcher(result, name, arg, comment)
        return

    # Parse ArgumentAdapting matchers.
    m = re.match(
        r"""^.*ArgumentAdaptingMatcherFunc<.*>\s*
            ([a-zA-Z]*);$""",
        declaration,
        flags=re.X,
    )
    if m:
        name = m.groups()[0]
        add_matcher("*", name, "Matcher<*>", comment)
        return

    # Parse Variadic functions.
    m = re.match(
        r"""^.*internal::VariadicFunction\s*<\s*([^,]+),\s*([^,]+),\s*[^>]+>\s*
            ([a-zA-Z]*);$""",
        declaration,
        flags=re.X,
    )
    if m:
        result, arg, name = m.groups()
        add_matcher(result, name, "%s, ..., %s" % (arg, arg), comment)
        return

    m = re.match(
        r"""^.*internal::VariadicFunction\s*<\s*
            internal::PolymorphicMatcher<[\S\s]+
            AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\),\s*(.*);$""",
        declaration,
        flags=re.X,
    )
    if m:
        results, trailing = m.groups()
        # 'trailing' has the shape '..., ArgType, Function> name': split off
        # the name first, then the last template argument, and keep the
        # argument before that one as the argument type.
        trailing, name = trailing.rsplit(">", 1)
        name = name.strip()
        trailing, _ = trailing.rsplit(",", 1)
        _, arg = trailing.rsplit(",", 1)
        arg = arg.strip()

        result_types = [r.strip() for r in results.split(",")]
        for result_type in result_types:
            add_matcher(result_type, name, "%s, ..., %s" % (arg, arg), comment)
        return

    # Parse Variadic operator matchers.
    m = re.match(
        r"""^.*VariadicOperatorMatcherFunc\s*<\s*([^,]+),\s*([^\s]+)\s*>\s*
            ([a-zA-Z]*);$""",
        declaration,
        flags=re.X,
    )
    if m:
        _min_args, max_args, name = m.groups()
        if max_args == "1":
            add_matcher("*", name, "Matcher<*>", comment)
            return
        elif max_args == "std::numeric_limits<unsigned>::max()":
            add_matcher("*", name, "Matcher<*>, ..., Matcher<*>", comment)
            return
        # Any other arity falls through to the remaining patterns below.

    m = re.match(
        r"""^.*MapAnyOfMatcher<.*>\s*
            ([a-zA-Z]*);$""",
        declaration,
        flags=re.X,
    )
    if m:
        name = m.groups()[0]
        add_matcher("*", name, "Matcher<*>...Matcher<*>", comment)
        return

    # Parse free standing matcher functions, like:
    #   Matcher<ResultType> Name(Matcher<ArgumentType> InnerMatcher) {
    m = re.match(
        r"""^\s*(?:template\s+<\s*(?:class|typename)\s+(.+)\s*>\s+)?
            (.*)\s+
            ([^\s\(]+)\s*\(
            (.*)
            \)\s*{""",
        declaration,
        re.X,
    )
    if not m:
        print('*** Unparsable: "' + declaration + '" ***')
        return

    template_name, result, name, args = m.groups()
    if template_name:
        # The template name is spliced into patterns below; escape it so that
        # it is always matched literally.
        template_pattern = re.escape(template_name)
        matcher_template_args = re.findall(
            r"Matcher<\s*(%s)\s*>" % template_pattern, args
        )
        template_args = re.findall(
            r"(?:^|[\s,<])(%s)(?:$|[\s,>])" % template_pattern, args
        )
        if len(matcher_template_args) < len(template_args):
            # The template name is used naked, so don't replace with `*` later on
            template_name = None
        else:
            args = re.sub(
                r"(^|[\s,<])%s($|[\s,>])" % template_pattern, r"\1*\2", args
            )
    args = ", ".join(p.strip() for p in args.split(","))
    result_match = re.match(
        r"(?:^|.*\s+)internal::(?:Bindable)?Matcher<([^>]+)>$", result
    )
    if result_match:
        result_types = [result_match.group(1)]
        if template_name and result_types[0] == template_name:
            result_types = ["*"]
    else:
        result_types = extract_result_types(comment)
    if not result_types:
        if not comment:
            # Only overloads don't have their own doxygen comments; ignore those.
            print('Ignoring "%s"' % name)
        else:
            print('Cannot determine result type for "%s"' % name)
        return
    for result_type in result_types:
        add_matcher(result_type, name, args, comment)


def sort_table(matcher_type, matcher_map):
    """Returns the sorted html table for the given row map.

    The rows are ordered by their key in 'matcher_map' and wrapped in the
    START/END marker comments of the 'matcher_type' section.
    """
    table = "".join(matcher_map[key] + "\n" for key in sorted(matcher_map))
    return (
        "<!-- START_%(type)s_MATCHERS -->\n"
        + "%(table)s"
        + "<!--END_%(type)s_MATCHERS -->"
    ) % {
        "type": matcher_type,
        "table": table,
    }


# Parse the ast matchers.
# We alternate between two modes:
# body = True: We parse the definition of a matcher. We need
#   to parse the full definition before adding a matcher, as the
#   definition might contain static asserts that specify the result
#   type.
# body = False: We parse the comments and declaration of the matcher.
comment = ""
declaration = ""
allowed_types = []
body = False
# Read the header up front so the file handle is closed before parsing.
with open(MATCHERS_FILE) as matchers_file:
    matcher_lines = matchers_file.read().splitlines()
for line in matcher_lines:
    if body:
        # A '}' in the first column ends the matcher definition.
        if line.startswith("}"):
            if declaration:
                act_on_decl(declaration, comment, allowed_types)
                comment = ""
                declaration = ""
                allowed_types = []
            body = False
        else:
            # Collect the node types named by static asserts in the body.
            m = re.search(r"is_base_of<([^,]+), NodeType>", line)
            if m and m.group(1):
                allowed_types.append(m.group(1))
        continue
    if line.lstrip().startswith("/"):
        comment += re.sub(r"^/+\s?", "", line) + "\n"
    else:
        declaration += " " + line
        tail = line.rstrip()
        # A line ending in '{' opens a matcher body, except for '= {', which
        # is not treated as the start of a body.
        opens_body = tail.endswith("{") and not tail.endswith("= {")
        if opens_body:
            body = True
        elif not tail or tail.endswith(";"):
            # A blank line or a ';' terminates a body-less declaration.
            act_on_decl(declaration, comment, allowed_types)
            comment = ""
            declaration = ""
            allowed_types = []

node_matcher_table = sort_table("DECL", node_matchers)
narrowing_matcher_table = sort_table("NARROWING", narrowing_matchers)
traversal_matcher_table = sort_table("TRAVERSAL", traversal_matchers)


def _replace_section(html, section, table):
    """Returns 'html' with the marked 'section' replaced by 'table'.

    The table is inserted through a replacement function so that it is taken
    literally; a plain replacement string would have its backslashes
    interpreted as escapes by re.sub().
    """
    return re.sub(
        r"<!-- START_%s_MATCHERS.*END_%s_MATCHERS -->" % (section, section),
        lambda _match: table,
        html,
        flags=re.S,
    )


with open(HTML_FILE) as html_file:
    reference = html_file.read()
reference = _replace_section(reference, "DECL", node_matcher_table)
reference = _replace_section(reference, "NARROWING", narrowing_matcher_table)
reference = _replace_section(reference, "TRAVERSAL", traversal_matcher_table)

# Write back to the file that was read above, rather than to a path relative
# to the current working directory.
with open(HTML_FILE, "w", newline="\n") as output:
    output.write(reference)