xref: /llvm-project/clang/tools/include-mapping/cppreference_parser.py (revision f385542f62fa1f57001c95c476165e1618cb54ba)
#!/usr/bin/env python3
# ===- cppreference_parser.py -  ------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===------------------------------------------------------------------------===#
946a6f5aeSKirill Bobyrev
10c0ce44e8SVadim D.from bs4 import BeautifulSoup, NavigableString, Tag
1146a6f5aeSKirill Bobyrev
1246a6f5aeSKirill Bobyrevimport collections
1346a6f5aeSKirill Bobyrevimport multiprocessing
1446a6f5aeSKirill Bobyrevimport os
1546a6f5aeSKirill Bobyrevimport re
1646a6f5aeSKirill Bobyrevimport signal
1746a6f5aeSKirill Bobyrevimport sys
1846a6f5aeSKirill Bobyrev
1946a6f5aeSKirill Bobyrev
class Symbol:
    """A symbol together with its namespace and the headers that provide it.

    Instances order by namespace first and unqualified name second, so a list
    of symbols can be passed straight to sorted().
    """

    def __init__(self, name, namespace, headers):
        # Unqualified symbol name, e.g. "move".
        self.name = name
        # Namespace of the symbol (with trailing "::"), e.g. "std::", or ""
        # for the global scope. None for C symbols.
        self.namespace = namespace
        # A list of corresponding headers.
        self.headers = headers

    def __repr__(self):
        return "Symbol(name=%r, namespace=%r, headers=%r)" % (
            self.name,
            self.namespace,
            self.headers,
        )

    def __lt__(self, other):
        """Return True if this symbol sorts before `other`."""
        if not isinstance(other, Symbol):
            # Let Python try the reflected operation / raise TypeError.
            return NotImplemented
        if self.namespace != other.namespace:
            # str() makes a None namespace (C symbols) comparable with the
            # string namespaces of C++ symbols.
            return str(self.namespace) < str(other.namespace)
        return self.name < other.name
344da28387SViktoriia Bakalova
3546a6f5aeSKirill Bobyrev
3646a6f5aeSKirill Bobyrevdef _HasClass(tag, *classes):
37dd3c26a0STobias Hieta    for c in tag.get("class", []):
3846a6f5aeSKirill Bobyrev        if c in classes:
3946a6f5aeSKirill Bobyrev            return True
4046a6f5aeSKirill Bobyrev    return False
4146a6f5aeSKirill Bobyrev
4246a6f5aeSKirill Bobyrev
def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
    """Parse symbol page and retrieve the include header defined in this page.
    The symbol page provides header for the symbol, specifically in
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
    </td></tr>

    Args:
      symbol_page_html: the HTML text of the symbol page.
      symbol_name: the unqualified symbol name to look for in declaration rows.
      qual_name: the qualified symbol name, accepted as an alternative match.

    Returns a set of header strings: the headers listed above the declaration
    rows that name the symbol, or every header found on the page if no row
    names the symbol.
    """
    headers = set()
    all_headers = set()

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select("table.t-dcl-begin, table.t-dsc-begin"):
        current_headers = []
        was_decl = False
        for row in table.select("tr"):
            if _HasClass(row, "t-dcl", "t-dsc"):
                was_decl = True
                # Symbols are in the first cell.
                first_cell = row.find("td")
                if first_cell is None:
                    # A declaration row without any cell names no symbol.
                    continue
                if not any(
                    sym in (symbol_name, qual_name)
                    for sym in first_cell.stripped_strings
                ):
                    continue
                headers.update(current_headers)
            elif _HasClass(row, "t-dsc-header"):
                # If we saw a decl since the last header, this is a new block
                # of headers for a new block of decls.
                if was_decl:
                    current_headers = []
                was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped
                # in <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers
9246a6f5aeSKirill Bobyrev
9346a6f5aeSKirill Bobyrev
def _ParseSymbolVariant(caption):
    """Extract the variant annotation that follows a symbol link, if any.

    Args:
      caption: the node right after the symbol's <a> element in the index
        page (may be None or any node type).

    Returns the variant text, or None when `caption` is not a text node
    containing "(" or the annotation has an unrecognized shape. Two shapes
    are recognized:
      - plain text such as " (algorithm)", giving "algorithm";
      - "(" followed by a <code> element and then text starting with ")",
        such as (<code>std::complex</code>), giving the <code> text.
    """
    if not (isinstance(caption, NavigableString) and "(" in caption):
        return None

    if ")" in caption.text:  # (locale), (algorithm), etc.
        return caption.text.strip(" ()")

    second_part = caption.next_sibling
    if isinstance(second_part, Tag) and second_part.name == "code":
        # (<code>std::complex</code>), etc.
        third_part = second_part.next_sibling
        if isinstance(third_part, NavigableString) and third_part.text.startswith(")"):
            return second_part.text
    return None
108c0ce44e8SVadim D.
109c0ce44e8SVadim D.
def _ParseIndexPage(index_page_html):
    """Parse index page.
    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
    `variant` is None for entries without a recognized annotation.
    """
    symbols = []
    soup = BeautifulSoup(index_page_html, "html.parser")
    for symbol_href in soup.select("a[title]"):
        symbol_tt = symbol_href.find("tt")
        href = symbol_href.get("href")
        if symbol_tt is None or href is None:
            # Not a symbol entry, or there is no page to follow.
            continue
        # Annotated symbols like "acos<>() (std::complex)" tend to be
        # overloads, and the primary is usually more useful. The annotation
        # is reported as the variant so that the caller can decide.
        # This accidentally accepts begin/end despite the (iterator) caption:
        # the (since C++11) note is first. They are good symbols, so the bug
        # is unfixed.
        variant = _ParseSymbolVariant(symbol_href.next_sibling)
        symbols.append(
            (
                symbol_tt.text.rstrip("<>()"),  # strip any trailing <>()
                href,
                variant,
            )
        )
    return symbols
13946a6f5aeSKirill Bobyrev
14046a6f5aeSKirill Bobyrev
def _ReadSymbolPage(path, name, qual_name):
    """Read the symbol page at `path` as UTF-8 and return its parsed headers.

    `name` and `qual_name` are forwarded unchanged to _ParseSymbolPage, whose
    result is returned.
    """
    with open(path, encoding="utf-8") as f:
        return _ParseSymbolPage(f.read(), name, qual_name)
14446a6f5aeSKirill Bobyrev
14546a6f5aeSKirill Bobyrev
def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: a pool providing apply_async(); symbol pages are parsed on it.
      root_dir: directory containing the index page; symbol page paths from
        the index are resolved relative to it.
      index_page_name: file name of the index page inside root_dir.
      namespace: namespace prefix for every symbol (e.g. "std::"), or None.
      variants_to_accept: dict from qualified symbol name to the variants
        that are accepted for that symbol.

    Returns a list of Symbols sorted by name; each symbol's headers are
    sorted so that the output is the same from run to run.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    # Read the index up front so the file is not held open while the workers
    # run.
    with open(index_page_path, "r", encoding="utf-8") as f:
        index_page_html = f.read()

    # Read each symbol page in parallel.
    results = []  # (symbol_name, promise of [header...])
    for symbol_name, symbol_page_path, variant in _ParseIndexPage(index_page_html):
        # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
        # FIXME: use these as a fallback rather than ignoring entirely.
        qualified_symbol_name = (namespace or "") + symbol_name
        variants_for_symbol = variants_to_accept.get(qualified_symbol_name, ())
        if variant and variant not in variants_for_symbol:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        if not os.path.isfile(path):
            sys.stderr.write(
                "Discarding information for symbol: %s. Page %s does not exist.\n"
                % (symbol_name, path)
            )
            continue
        results.append(
            (
                symbol_name,
                pool.apply_async(
                    _ReadSymbolPage, (path, symbol_name, qualified_symbol_name)
                ),
            )
        )

    # Build map from symbol name to a set of headers. A name can be listed
    # more than once in the index; its headers are merged.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    return [
        Symbol(name, namespace, sorted(headers))
        for name, headers in sorted(symbol_headers.items(), key=lambda item: item[0])
    ]
19446a6f5aeSKirill Bobyrev
19546a6f5aeSKirill Bobyrev
def signal_ignore_initializer():
    """Make the calling process ignore SIGINT (Ctrl-C).

    Used as the initializer for the worker pool. Returns the previous SIGINT
    handler, as reported by signal.signal().
    """
    return signal.signal(signal.SIGINT, signal.SIG_IGN)
1987213ae84SCassie Jones
1997213ae84SCassie Jones
def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a sorted list of Symbols collected from all the pages.
    """
    # By default we prefer the non-variant versions, as they're more common. But
    # there are some symbols, whose variant is more common. This list describes
    # those symbols.
    # Every value must be a tuple (note the trailing commas): with a bare
    # string, the "in" test on it would match substrings of the variant name.
    variants_to_accept = {
        # std::remove<> has variant algorithm.
        "std::remove": ("algorithm",),
        # These functions don't have a generic version, and all variants are
        # defined in <chrono>
        "std::chrono::abs": ("std::chrono::duration",),
        "std::chrono::ceil": ("std::chrono::duration",),
        "std::chrono::floor": ("std::chrono::duration",),
        "std::chrono::from_stream": ("std::chrono::day",),
        "std::chrono::round": ("std::chrono::duration",),
        # Same, but in <filesystem>
        "std::filesystem::begin": ("std::filesystem::directory_iterator",),
        "std::filesystem::end": ("std::filesystem::directory_iterator",),
        "std::ranges::get": ("std::ranges::subrange",),
    }
    symbols = []
    # Run many workers to process individual symbol pages under the symbol index.
    # Don't allow workers to capture Ctrl-C.
    pool = multiprocessing.Pool(initializer=signal_ignore_initializer)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(
                _GetSymbols(pool, root_dir, page_name, namespace, variants_to_accept)
            )
    finally:
        # Stop the workers and wait for them, even if parsing failed or was
        # interrupted.
        pool.terminate()
        pool.join()
    return sorted(symbols)
236