xref: /llvm-project/clang/docs/tools/dump_ast_matchers.py (revision b5b15c1973935da943e8cee26dc961c6dbe339b9)
1#!/usr/bin/env python3
2# A tool to parse ASTMatchers.h and update the documentation in
3# ../LibASTMatchersReference.html automatically. Run from the
4# directory in which this file is located to update the docs.
5
6import collections
7import re
8import os
9
10try:
11    from urllib.request import urlopen
12except ImportError:
13    from urllib2 import urlopen
14
# The doxygen class index lists every documented clang class; esc() consults
# it to decide whether an AST node name can be turned into a hyperlink.
CLASS_INDEX_PAGE_URL = "https://clang.llvm.org/doxygen/classes.html"
# Stays None when the page cannot be downloaded (best effort: the failure is
# reported on stdout and the script carries on).
CLASS_INDEX_PAGE = None
try:
    CLASS_INDEX_PAGE = urlopen(CLASS_INDEX_PAGE_URL).read().decode("utf-8")
except Exception as error:
    print("Unable to get %s: %s" % (CLASS_INDEX_PAGE_URL, error))

# Input header and output document, both located relative to this script.
CURRENT_DIR = os.path.dirname(__file__)
MATCHERS_FILE = os.path.join(
    CURRENT_DIR,
    "../../include/clang/ASTMatchers/ASTMatchers.h",
)
HTML_FILE = os.path.join(CURRENT_DIR, "../LibASTMatchersReference.html")
27
# Each matcher is documented in one row of the form:
#   result | name | args
# The subsequent row contains the documentation and is hidden by default,
# becoming visible via javascript when the user clicks the matcher name.
# The %(...)s placeholders (result, name, args, comment, id) are filled in
# by add_matcher().
TD_TEMPLATE = """
<tr><td>%(result)s</td><td class="name" onclick="toggle('%(id)s')"><a name="%(id)sAnchor">%(name)s</a></td><td>%(args)s</td></tr>
<tr><td colspan="4" class="doc" id="%(id)s"><pre>%(comment)s</pre></td></tr>
"""

# We categorize the matchers into these three categories in the reference.
# Each dict maps a lookup key built by add_matcher() to the rendered html
# rows for one matcher; sort_table() later emits the rows ordered by key.
node_matchers = {}
narrowing_matchers = {}
traversal_matchers = {}

# We output multiple rows per matcher if the matcher can be used on multiple
# node types. Thus, we need a new id per row to control the documentation
# pop-up. ids[name] keeps track of those ids: it holds the numeric suffix
# that the next row generated for `name` will receive.
ids = collections.defaultdict(int)

# Cache for doxygen urls we have already verified. Maps a class page url to
# a bool telling esc() whether to link to it.
doxygen_probes = {}
49
50
def esc(text):
    """Return `text` html-escaped, with known AST node types hyperlinked.

    After escaping, every `Matcher&lt;Name&gt;` occurrence whose `Name` is
    listed on the doxygen class index page is wrapped in a link to that
    class's documentation.
    """
    # "&" has to go first so the entities introduced below stay intact.
    for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
        text = text.replace(raw, entity)

    def link_if_exists(match):
        """Wrap a likely AST node name in a link to its clang docs.

        The link is only emitted if the page exists, i.e. if it is
        referenced from the class index page. When the index page could not
        be downloaded, every name is assumed to exist.
        """
        node = match.group(1)
        url = "https://clang.llvm.org/doxygen/classclang_1_1%s.html" % node
        known = doxygen_probes.get(url)
        if known is None:
            # First time we see this name: probe the index and cache the
            # verdict so the diagnostic below is printed at most once.
            if CLASS_INDEX_PAGE is None:
                known = True
            else:
                known = ('href="classclang_1_1%s.html"' % node) in CLASS_INDEX_PAGE
            doxygen_probes[url] = known
            if not known:
                print("Did not find %s in class index page" % node)
        if not known:
            return match.group(0)
        return r'Matcher&lt;<a href="%s">%s</a>&gt;' % (url, node)

    return re.sub(r"Matcher&lt;([^\*&]+)&gt;", link_if_exists, text)
80
81
def extract_result_types(comment):
    """Extracts a list of result types from the given comment.

    We allow annotations in the comment of the matcher to specify what
    nodes a matcher can match on. Those comments have the form:
      Usable as: Any Matcher | (Matcher<T1>[, Matcher<T2>[, ...]])

    Returns ['*'] in case of 'Any Matcher'. Otherwise returns the listed
    types; they are peeled off the end of the comment one at a time, so the
    result is in reverse order of appearance (['T2', 'T1'] for the form
    above). Returns the empty list if no 'Usable as' specification could be
    parsed.
    """
    if re.search(r"Usable as: Any Matcher\s*$", comment, re.S):
        return ["*"]

    result_types = []
    while True:
        # Strip one trailing "Matcher<T>" (plus optional comma) per round.
        m = re.match(r"^(.*)Matcher<([^>]+)>\s*,?\s*$", comment, re.S)
        if not m:
            break
        result_types.append(m.group(2))
        comment = m.group(1)

    # The matcher list only counts if it was introduced by "Usable as:".
    if re.search(r"Usable as:\s*$", comment):
        return result_types
    return []
106
107
def strip_doxygen(comment):
    """Returns the given comment without backslash-escaped doxygen words.

    Lines consisting solely of a doxygen command are dropped entirely, a
    doxygen 'see' command is spelled out as "See also:", and every remaining
    command is removed together with the whitespace that follows it.
    """
    # If there is only a doxygen keyword in the line, delete the whole line.
    comment = re.sub(r"^\\[^\s]+\n", r"", comment, flags=re.M)

    # If there is a doxygen \see command, change the \see prefix into "See also:".
    # FIXME: it would be better to turn this into a link to the target instead.
    comment = re.sub(r"\\see", r"See also:", comment)

    # Delete the doxygen command and the following whitespace.
    comment = re.sub(r"\\[^\s]+\s+", r"", comment)
    return comment
120
121
def unify_arguments(args):
    """Strip implementation detail from an argument list for display.

    Namespace qualifiers, references and constness are removed, and the
    internal matcher spellings (`M`, `M1`, ..., `BindableMatcher`) are
    rewritten to plain `Matcher`.
    """
    # Drop namespace qualifiers, longest spelling first so no partial
    # prefix is left behind.
    for qualifier in (
        "clang::ast_matchers::internal::",
        "ast_matchers::internal::",
        "internal::",
    ):
        args = args.replace(qualifier, "")

    # "extern const T&" becomes "T "; any other reference marker becomes a
    # plain space.
    args = re.sub(r"extern const\s+(.*)&", r"\1 ", args)
    args = args.replace("&", " ")

    # A bare "M" / "M<digit>" type stands for an arbitrary matcher.
    args = re.sub(r"(^|\s)M\d?(\s)", r"\1Matcher<*>\2", args)

    without_bindable = args.replace("BindableMatcher", "Matcher")
    return without_bindable.replace("const Matcher", "Matcher")
133
134
def unify_type(result_type):
    """Reduce `internal::[Bindable]Matcher<T>` to the bare node type `T`.

    Anything that is not exactly such a wrapper around a plain identifier
    is returned unchanged.
    """
    wrapper = re.compile(
        r"^internal::(Bindable)?Matcher<([a-zA-Z_][a-zA-Z0-9_]*)>$"
    )
    return wrapper.sub(r"\2", result_type)
141
142
def add_matcher(result_type, name, args, comment, is_dyncast=False):
    """Adds a matcher to one of our categories.

    Renders the html rows for the matcher `name` (usable on `result_type`,
    taking `args`, documented by `comment`) and stores them in
    node_matchers, narrowing_matchers or traversal_matchers. `is_dyncast`
    marks node matchers. If a row with the same lookup key was already
    recorded, the longer of the two renderings is kept.
    """
    if name == "id":
        # FIXME: Figure out whether we want to support the 'id' matcher.
        return
    # Every row gets its own id, even if it ends up not being stored.
    matcher_id = "%s%d" % (name, ids[name])
    ids[name] += 1
    args = unify_arguments(args)
    result_type = unify_type(result_type)

    docs_result_type = esc("Matcher<%s>" % result_type)

    if name == "mapAnyOf":
        args = "nodeMatcherFunction..."
        docs_result_type = "<em>unspecified</em>"

    escaped_args = esc(args)
    matcher_html = TD_TEMPLATE % {
        "result": docs_result_type,
        "name": name,
        "args": escaped_args,
        "comment": esc(strip_doxygen(comment)),
        "id": matcher_id,
    }
    if is_dyncast:
        table = node_matchers
        lookup = result_type + name
    # Use a heuristic to figure out whether a matcher is a narrowing or
    # traversal matcher. By default, matchers that take other matchers as
    # arguments (and are not node matchers) do traversal. We specifically
    # exclude known narrowing matchers that also take other matchers as
    # arguments.
    elif "Matcher<" not in args or name in (
        "allOf",
        "anyOf",
        "anything",
        "unless",
        "mapAnyOf",
    ):
        table = narrowing_matchers
        lookup = result_type + name + escaped_args
    else:
        table = traversal_matchers
        lookup = result_type + name + escaped_args

    # Overloads can map to the same key; keep the most detailed rendering.
    existing = table.get(lookup)
    if existing is None or len(existing) < len(matcher_html):
        table[lookup] = matcher_html
189
190
191def act_on_decl(declaration, comment, allowed_types):
192    """Parse the matcher out of the given declaration and comment.
193
194    If 'allowed_types' is set, it contains a list of node types the matcher
195    can match on, as extracted from the static type asserts in the matcher
196    definition.
197    """
198    if declaration.strip():
199
200        if re.match(r"^\s?(#|namespace|using|template <typename NodeType> using|})", declaration):
201            return
202
203        # Node matchers are defined by writing:
204        #   VariadicDynCastAllOfMatcher<ResultType, ArgumentType> name;
205        m = re.match(
206            r""".*Variadic(?:DynCast)?AllOfMatcher\s*<
207                       \s*([^\s,]+)\s*(?:,
208                       \s*([^\s>]+)\s*)?>
209                       \s*([^\s;]+)\s*;\s*$""",
210            declaration,
211            flags=re.X,
212        )
213        if m:
214            result, inner, name = m.groups()
215            if not inner:
216                inner = result
217            add_matcher(
218                result, name, "Matcher<%s>..." % inner, comment, is_dyncast=True
219            )
220            return
221
222        # Special case of type matchers:
223        #   AstTypeMatcher<ArgumentType> name
224        m = re.match(
225            r""".*AstTypeMatcher\s*<
226                       \s*([^\s>]+)\s*>
227                       \s*([^\s;]+)\s*;\s*$""",
228            declaration,
229            flags=re.X,
230        )
231        if m:
232            inner, name = m.groups()
233            add_matcher(
234                "Type", name, "Matcher<%s>..." % inner, comment, is_dyncast=True
235            )
236            # FIXME: re-enable once we have implemented casting on the TypeLoc
237            # hierarchy.
238            # add_matcher('TypeLoc', '%sLoc' % name, 'Matcher<%sLoc>...' % inner,
239            #             comment, is_dyncast=True)
240            return
241
242        # Parse the various matcher definition macros.
243        m = re.match(
244            """.*AST_TYPE(LOC)?_TRAVERSE_MATCHER(?:_DECL)?\(
245                       \s*([^\s,]+\s*),
246                       \s*(?:[^\s,]+\s*),
247                       \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\)
248                     \)\s*;\s*$""",
249            declaration,
250            flags=re.X,
251        )
252        if m:
253            loc, name, results = m.groups()[0:3]
254            result_types = [r.strip() for r in results.split(",")]
255
256            comment_result_types = extract_result_types(comment)
257            if comment_result_types and sorted(result_types) != sorted(
258                comment_result_types
259            ):
260                raise Exception("Inconsistent documentation for: %s" % name)
261            for result_type in result_types:
262                add_matcher(result_type, name, "Matcher<Type>", comment)
263                # if loc:
264                #   add_matcher('%sLoc' % result_type, '%sLoc' % name, 'Matcher<TypeLoc>',
265                #               comment)
266            return
267
268        m = re.match(
269            r"""^\s*AST_POLYMORPHIC_MATCHER(_P)?(.?)(?:_OVERLOAD)?\(
270                          \s*([^\s,]+)\s*,
271                          \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\)
272                       (?:,\s*([^\s,]+)\s*
273                          ,\s*([^\s,]+)\s*)?
274                       (?:,\s*([^\s,]+)\s*
275                          ,\s*([^\s,]+)\s*)?
276                       (?:,\s*\d+\s*)?
277                      \)\s*{\s*$""",
278            declaration,
279            flags=re.X,
280        )
281
282        if m:
283            p, n, name, results = m.groups()[0:4]
284            args = m.groups()[4:]
285            result_types = [r.strip() for r in results.split(",")]
286            if allowed_types and allowed_types != result_types:
287                raise Exception("Inconsistent documentation for: %s" % name)
288            if n not in ["", "2"]:
289                raise Exception('Cannot parse "%s"' % declaration)
290            args = ", ".join(
291                "%s %s" % (args[i], args[i + 1])
292                for i in range(0, len(args), 2)
293                if args[i]
294            )
295            for result_type in result_types:
296                add_matcher(result_type, name, args, comment)
297            return
298
299        m = re.match(
300            r"""^\s*AST_POLYMORPHIC_MATCHER_REGEX(?:_OVERLOAD)?\(
301                          \s*([^\s,]+)\s*,
302                          \s*AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\),
303                          \s*([^\s,]+)\s*
304                       (?:,\s*\d+\s*)?
305                      \)\s*{\s*$""",
306            declaration,
307            flags=re.X,
308        )
309
310        if m:
311            name, results, arg_name = m.groups()[0:3]
312            result_types = [r.strip() for r in results.split(",")]
313            if allowed_types and allowed_types != result_types:
314                raise Exception("Inconsistent documentation for: %s" % name)
315            arg = "StringRef %s, Regex::RegexFlags Flags = NoFlags" % arg_name
316            comment += """
317If the matcher is used in clang-query, RegexFlags parameter
318should be passed as a quoted string. e.g: "NoFlags".
319Flags can be combined with '|' example \"IgnoreCase | BasicRegex\"
320"""
321            for result_type in result_types:
322                add_matcher(result_type, name, arg, comment)
323            return
324
325        m = re.match(
326            r"""^\s*AST_MATCHER_FUNCTION(_P)?(.?)(?:_OVERLOAD)?\(
327                       (?:\s*([^\s,]+)\s*,)?
328                          \s*([^\s,]+)\s*
329                       (?:,\s*([^\s,]+)\s*
330                          ,\s*([^\s,]+)\s*)?
331                       (?:,\s*([^\s,]+)\s*
332                          ,\s*([^\s,]+)\s*)?
333                       (?:,\s*\d+\s*)?
334                      \)\s*{\s*$""",
335            declaration,
336            flags=re.X,
337        )
338        if m:
339            p, n, result, name = m.groups()[0:4]
340            args = m.groups()[4:]
341            if n not in ["", "2"]:
342                raise Exception('Cannot parse "%s"' % declaration)
343            args = ", ".join(
344                "%s %s" % (args[i], args[i + 1])
345                for i in range(0, len(args), 2)
346                if args[i]
347            )
348            add_matcher(result, name, args, comment)
349            return
350
351        m = re.match(
352            r"""^\s*AST_MATCHER(_P)?(.?)(?:_OVERLOAD)?\(
353                       (?:\s*([^\s,]+)\s*,)?
354                          \s*([^\s,]+)\s*
355                       (?:,\s*([^,]+)\s*
356                          ,\s*([^\s,]+)\s*)?
357                       (?:,\s*([^\s,]+)\s*
358                          ,\s*([^\s,]+)\s*)?
359                       (?:,\s*\d+\s*)?
360                      \)\s*{""",
361            declaration,
362            flags=re.X,
363        )
364        if m:
365            p, n, result, name = m.groups()[0:4]
366            args = m.groups()[4:]
367            if not result:
368                if not allowed_types:
369                    raise Exception("Did not find allowed result types for: %s" % name)
370                result_types = allowed_types
371            else:
372                result_types = [result]
373            if n not in ["", "2"]:
374                raise Exception('Cannot parse "%s"' % declaration)
375            args = ", ".join(
376                "%s %s" % (args[i], args[i + 1])
377                for i in range(0, len(args), 2)
378                if args[i]
379            )
380            for result_type in result_types:
381                add_matcher(result_type, name, args, comment)
382            return
383
384        m = re.match(
385            r"""^\s*AST_MATCHER_REGEX(?:_OVERLOAD)?\(
386                       \s*([^\s,]+)\s*,
387                       \s*([^\s,]+)\s*,
388                       \s*([^\s,]+)\s*
389                       (?:,\s*\d+\s*)?
390                      \)\s*{""",
391            declaration,
392            flags=re.X,
393        )
394        if m:
395            result, name, arg_name = m.groups()[0:3]
396            if not result:
397                if not allowed_types:
398                    raise Exception("Did not find allowed result types for: %s" % name)
399                result_types = allowed_types
400            else:
401                result_types = [result]
402            arg = "StringRef %s, Regex::RegexFlags Flags = NoFlags" % arg_name
403            comment += """
404If the matcher is used in clang-query, RegexFlags parameter
405should be passed as a quoted string. e.g: "NoFlags".
406Flags can be combined with '|' example \"IgnoreCase | BasicRegex\"
407"""
408
409            for result_type in result_types:
410                add_matcher(result_type, name, arg, comment)
411            return
412
413        # Parse ArgumentAdapting matchers.
414        m = re.match(
415            r"""^.*ArgumentAdaptingMatcherFunc<.*>\s*
416              ([a-zA-Z]*);$""",
417            declaration,
418            flags=re.X,
419        )
420        if m:
421            name = m.groups()[0]
422            add_matcher("*", name, "Matcher<*>", comment)
423            return
424
425        # Parse Variadic functions.
426        m = re.match(
427            r"""^.*internal::VariadicFunction\s*<\s*([^,]+),\s*([^,]+),\s*[^>]+>\s*
428              ([a-zA-Z]*);$""",
429            declaration,
430            flags=re.X,
431        )
432        if m:
433            result, arg, name = m.groups()[:3]
434            add_matcher(result, name, "%s, ..., %s" % (arg, arg), comment)
435            return
436
437        m = re.match(
438            r"""^.*internal::VariadicFunction\s*<\s*
439              internal::PolymorphicMatcher<[\S\s]+
440              AST_POLYMORPHIC_SUPPORTED_TYPES\(([^)]*)\),\s*(.*);$""",
441            declaration,
442            flags=re.X,
443        )
444
445        if m:
446            results, trailing = m.groups()
447            trailing, name = trailing.rsplit(">", 1)
448            name = name.strip()
449            trailing, _ = trailing.rsplit(",", 1)
450            _, arg = trailing.rsplit(",", 1)
451            arg = arg.strip()
452
453            result_types = [r.strip() for r in results.split(",")]
454            for result_type in result_types:
455                add_matcher(result_type, name, "%s, ..., %s" % (arg, arg), comment)
456            return
457
458        # Parse Variadic operator matchers.
459        m = re.match(
460            r"""^.*VariadicOperatorMatcherFunc\s*<\s*([^,]+),\s*([^\s]+)\s*>\s*
461              ([a-zA-Z]*);$""",
462            declaration,
463            flags=re.X,
464        )
465        if m:
466            min_args, max_args, name = m.groups()[:3]
467            if max_args == "1":
468                add_matcher("*", name, "Matcher<*>", comment)
469                return
470            elif max_args == "std::numeric_limits<unsigned>::max()":
471                add_matcher("*", name, "Matcher<*>, ..., Matcher<*>", comment)
472                return
473
474        m = re.match(
475            r"""^.*MapAnyOfMatcher<.*>\s*
476              ([a-zA-Z]*);$""",
477            declaration,
478            flags=re.X,
479        )
480        if m:
481            name = m.groups()[0]
482            add_matcher("*", name, "Matcher<*>...Matcher<*>", comment)
483            return
484
485        # Parse free standing matcher functions, like:
486        #   Matcher<ResultType> Name(Matcher<ArgumentType> InnerMatcher) {
487        m = re.match(
488            r"""^\s*(?:template\s+<\s*(?:class|typename)\s+(.+)\s*>\s+)?
489                     (.*)\s+
490                     ([^\s\(]+)\s*\(
491                     (.*)
492                     \)\s*{""",
493            declaration,
494            re.X,
495        )
496        if m:
497            template_name, result, name, args = m.groups()
498            if template_name:
499                matcherTemplateArgs = re.findall(
500                    r"Matcher<\s*(%s)\s*>" % template_name, args
501                )
502                templateArgs = re.findall(
503                    r"(?:^|[\s,<])(%s)(?:$|[\s,>])" % template_name, args
504                )
505                if len(matcherTemplateArgs) < len(templateArgs):
506                    # The template name is used naked, so don't replace with `*`` later on
507                    template_name = None
508                else:
509                    args = re.sub(
510                        r"(^|[\s,<])%s($|[\s,>])" % template_name, r"\1*\2", args
511                    )
512            args = ", ".join(p.strip() for p in args.split(","))
513            m = re.match(r"(?:^|.*\s+)internal::(?:Bindable)?Matcher<([^>]+)>$", result)
514            if m:
515                result_types = [m.group(1)]
516                if (
517                    template_name
518                    and len(result_types) == 1
519                    and result_types[0] == template_name
520                ):
521                    result_types = ["*"]
522            else:
523                result_types = extract_result_types(comment)
524            if not result_types:
525                if not comment:
526                    # Only overloads don't have their own doxygen comments; ignore those.
527                    print('Ignoring "%s"' % name)
528                else:
529                    print('Cannot determine result type for "%s"' % name)
530            else:
531                for result_type in result_types:
532                    add_matcher(result_type, name, args, comment)
533        else:
534            print('*** Unparsable: "' + declaration + '" ***')
535
536
def sort_table(matcher_type, matcher_map):
    """Render `matcher_map` as the html table body for `matcher_type`.

    Rows are emitted in sorted key order, each followed by a newline, and
    framed by the START/END marker comments that delimit the section in the
    reference document.
    """
    rows = "".join(matcher_map[key] + "\n" for key in sorted(matcher_map))
    template = (
        "<!-- START_%(type)s_MATCHERS -->\n"
        "%(table)s"
        "<!--END_%(type)s_MATCHERS -->"
    )
    return template % {"type": matcher_type, "table": rows}
550
551
# Parse the ast matchers.
# We alternate between two modes:
# body = True: We parse the definition of a matcher. We need
#   to parse the full definition before adding a matcher, as the
#   definition might contain static asserts that specify the result
#   type.
# body = False: We parse the comments and declaration of the matcher.
comment = ""
declaration = ""
allowed_types = []
body = False
# Read the header up front so the file handle is closed before parsing.
with open(MATCHERS_FILE) as matchers_header:
    matchers_lines = matchers_header.read().splitlines()
for line in matchers_lines:
    stripped = line.strip()
    if body:
        # A closing brace in the first column ends the matcher definition.
        if line.startswith("}"):
            if declaration:
                act_on_decl(declaration, comment, allowed_types)
                comment = ""
                declaration = ""
                allowed_types = []
            body = False
        else:
            # Collect the node types named by static asserts in the body.
            m = re.search(r"is_base_of<([^,]+), NodeType>", line)
            if m and m.group(1):
                allowed_types += [m.group(1)]
        continue
    if stripped.startswith("/"):
        # Comment line: drop the leading slashes and one following space.
        comment += re.sub(r"^/+\s?", "", line) + "\n"
        continue
    declaration += " " + line
    ends_statement = stripped.endswith(";")
    # "= {" starts a braced initializer, not a matcher body.
    opens_body = stripped.endswith("{") and not stripped.endswith("= {")
    if stripped and not (ends_statement or opens_body):
        # The declaration continues on the next line.
        continue
    if opens_body:
        body = True
    else:
        act_on_decl(declaration, comment, allowed_types)
        comment = ""
        declaration = ""
        allowed_types = []
593
node_matcher_table = sort_table("DECL", node_matchers)
narrowing_matcher_table = sort_table("NARROWING", narrowing_matchers)
traversal_matcher_table = sort_table("TRAVERSAL", traversal_matchers)

with open(HTML_FILE) as reference_file:
    reference = reference_file.read()

# Swap each generated table in between its START/END marker comments. The
# replacement is passed as a callable so that backslashes in the generated
# html are inserted verbatim instead of being read as re.sub() escapes.
for section, table in (
    ("DECL", node_matcher_table),
    ("NARROWING", narrowing_matcher_table),
    ("TRAVERSAL", traversal_matcher_table),
):
    reference = re.sub(
        r"<!-- START_%s_MATCHERS.*END_%s_MATCHERS -->" % (section, section),
        lambda _match, table=table: table,
        reference,
        flags=re.S,
    )

# Write back to the file that was read, wherever the script is run from.
with open(HTML_FILE, "w", newline="\n") as output:
    output.write(reference)
620