xref: /llvm-project/clang/tools/include-mapping/cppreference_parser.py (revision f385542f62fa1f57001c95c476165e1618cb54ba)
1#!/usr/bin/env python3
2# ===- cppreference_parser.py -  ------------------------------*- python -*--===#
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8# ===------------------------------------------------------------------------===#
9
10from bs4 import BeautifulSoup, NavigableString, Tag
11
12import collections
13import multiprocessing
14import os
15import re
16import signal
17import sys
18
19
class Symbol:
    """A library symbol together with the headers that provide it."""

    def __init__(self, name, namespace, headers):
        # Headers that provide the symbol, stored as given by the caller.
        self.headers = headers
        # Enclosing namespace including the trailing "::" (e.g. "std::").
        # "" denotes the global scope; None marks a C symbol.
        self.namespace = namespace
        # Unqualified symbol name, e.g. "move".
        self.name = name

    def __lt__(self, other):
        """Order symbols by namespace first, then by unqualified name."""
        if self.namespace == other.namespace:
            return self.name < other.name
        # str() keeps the comparison valid when one of the namespaces is None.
        return str(self.namespace) < str(other.namespace)
34
35
36def _HasClass(tag, *classes):
37    for c in tag.get("class", []):
38        if c in classes:
39            return True
40    return False
41
42
def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
    """Parse symbol page and retrieve the include header defined in this page.
    The symbol page provides header for the symbol, specifically in
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
    </td></tr>

    Args:
      symbol_page_html: HTML text of the symbol page.
      symbol_name: unqualified symbol name to look for in declaration rows.
      qual_name: qualified symbol name, accepted as an alternative spelling.

    Returns a set of header strings (the text of the <code> elements). If no
    declaration row names the symbol, every header seen on the page is
    returned instead.
    """
    headers = set()
    all_headers = set()
    wanted_names = (symbol_name, qual_name)

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select("table.t-dcl-begin, table.t-dsc-begin"):
        current_headers = []
        was_decl = False
        for row in table.select("tr"):
            if _HasClass(row, "t-dcl", "t-dsc"):
                was_decl = True
                # Symbols are in the first cell. find() returns None when the
                # row has no <td>; such a row cannot name the symbol, so skip
                # it instead of failing on the attribute access.
                first_cell = row.find("td")
                if first_cell is None:
                    continue
                if any(sym in wanted_names for sym in first_cell.stripped_strings):
                    headers.update(current_headers)
            elif _HasClass(row, "t-dsc-header"):
                # If we saw a decl since the last header, this is a new block of
                # headers for a new block of decls.
                if was_decl:
                    current_headers = []
                was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped in
                # <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers
92
93
def _ParseSymbolVariant(caption):
    """Extract the variant annotation that follows a symbol link, if any.

    `caption` is the node right after the link. Two shapes are recognised:
      * a single text node such as " (locale)" -> "locale"
      * text "(" + <code>std::complex</code> + text ")..." -> "std::complex"

    Returns the variant string, or None when no annotation is recognised.
    """
    if not isinstance(caption, NavigableString) or "(" not in caption:
        return None

    caption_text = caption.text
    if ")" in caption_text:  # (locale), (algorithm), etc.
        return caption_text.strip(" ()")

    # Otherwise the annotation may be split as: "(" <code>...</code> ")".
    code_tag = caption.next_sibling
    if not (isinstance(code_tag, Tag) and code_tag.name == "code"):
        return None

    # (<code>std::complex</code>), etc.
    closing = code_tag.next_sibling
    if isinstance(closing, NavigableString) and closing.text.startswith(")"):
        return code_tag.text
    return None
108
109
def _ParseIndexPage(index_page_html):
    """Parse index page.
    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
    """
    soup = BeautifulSoup(index_page_html, "html.parser")
    entries = []
    for link in soup.select("a[title]"):
        # The text after the link may annotate the symbol, as in
        # "acos<>() (std::complex)". Annotated symbols tend to be overloads;
        # the variant is reported so callers can prefer the primary symbol.
        # begin/end carry an (iterator) caption but come out with no variant
        # because the (since C++11) note comes first. They are good symbols,
        # so that quirk is left alone.
        variant = _ParseSymbolVariant(link.next_sibling)
        name_tag = link.find("tt")
        if not name_tag:
            continue
        # Strip any trailing <>() from the displayed name.
        display_name = name_tag.text.rstrip("<>()")
        entries.append((display_name, link["href"], variant))
    return entries
139
140
def _ReadSymbolPage(path, name, qual_name):
    """Read the symbol page at `path` and return the headers parsed from it.

    `name` and `qual_name` are forwarded unchanged to _ParseSymbolPage.
    """
    with open(path, encoding="utf-8") as page_file:
        page_html = page_file.read()
    return _ParseSymbolPage(page_html, name, qual_name)
144
145
def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: worker pool providing apply_async(); symbol pages are parsed on it.
      root_dir: directory containing the index page; symbol page hrefs are
        resolved relative to it.
      index_page_name: file name of the index page inside root_dir.
      namespace: namespace prefix for the symbols (may be None, which is
        treated as "" when building the qualified name).
      variants_to_accept: dict mapping a qualified symbol name to the
        collection of variant names that are accepted for that symbol.

    Returns a list of Symbols sorted by name, each with its headers in sorted
    order.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    # Only the read needs the file; close it before waiting on the workers.
    with open(index_page_path, "r", encoding="utf-8") as f:
        index_page_html = f.read()

    # Read each symbol page in parallel.
    results = []  # (symbol_name, promise of [header...])
    for symbol_name, symbol_page_path, variant in _ParseIndexPage(index_page_html):
        # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
        # FIXME: use these as a fallback rather than ignoring entirely.
        qualified_symbol_name = (namespace or "") + symbol_name
        variants_for_symbol = variants_to_accept.get(qualified_symbol_name, ())
        if variant and variant not in variants_for_symbol:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        if os.path.isfile(path):
            results.append(
                (
                    symbol_name,
                    pool.apply_async(
                        _ReadSymbolPage, (path, symbol_name, qualified_symbol_name)
                    ),
                )
            )
        else:
            sys.stderr.write(
                "Discarding information for symbol: %s. Page %s does not exist.\n"
                % (symbol_name, path)
            )

    # Build map from symbol name to a set of headers. The same name can occur
    # several times in the index; their headers are merged.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    # Sort the headers as well: iterating a set of strings has no stable order
    # across runs, which would make the result nondeterministic.
    return [
        Symbol(name, namespace, sorted(symbol_headers[name]))
        for name in sorted(symbol_headers)
    ]
194
195
def signal_ignore_initializer():
    """Make the calling process ignore SIGINT.

    Returns whatever signal.signal() returns, i.e. the previous handler.
    """
    previous_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    return previous_handler
198
199
def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a sorted list of Symbols collected from all the given pages.
    """
    # By default we prefer the non-variant versions, as they're more common. But
    # there are some symbols, whose variant is more common. This list describes
    # those symbols.
    #
    # Each value must be a real tuple (note the trailing comma). A bare
    # parenthesised string would turn the membership test on the accepted
    # variants into a substring test.
    variants_to_accept = {
        # std::remove<> has variant algorithm.
        "std::remove": ("algorithm",),
        # These functions don't have a generic version, and all variants are defined in <chrono>
        "std::chrono::abs": ("std::chrono::duration",),
        "std::chrono::ceil": ("std::chrono::duration",),
        "std::chrono::floor": ("std::chrono::duration",),
        "std::chrono::from_stream": ("std::chrono::day",),
        "std::chrono::round": ("std::chrono::duration",),
        # Same, but in <filesystem>
        "std::filesystem::begin": ("std::filesystem::directory_iterator",),
        "std::filesystem::end": ("std::filesystem::directory_iterator",),
        "std::ranges::get": ("std::ranges::subrange",),
    }
    symbols = []
    # Run many workers to process individual symbol pages under the symbol index.
    # Don't allow workers to capture Ctrl-C.
    pool = multiprocessing.Pool(initializer=signal_ignore_initializer)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(
                _GetSymbols(pool, root_dir, page_name, namespace, variants_to_accept)
            )
    finally:
        # Always tear the workers down, including on error or Ctrl-C.
        pool.terminate()
        pool.join()
    return sorted(symbols)
236