#!/usr/bin/env python3
# ===- cppreference_parser.py - ------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===------------------------------------------------------------------------===#

from bs4 import BeautifulSoup, NavigableString, Tag

import collections
import multiprocessing
import os
import re
import signal
import sys


class Symbol:
    """A symbol listed on an index page together with its headers.

    Instances order by namespace first (compared via str(), so a None
    namespace sorts as the string "None") and then by name.
    """

    def __init__(self, name, namespace, headers):
        # unqualified symbol name, e.g. "move"
        self.name = name
        # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope)
        # None for C symbols.
        self.namespace = namespace
        # a list of corresponding headers
        self.headers = headers

    def __lt__(self, other):
        if self.namespace != other.namespace:
            # str() keeps the comparison well-defined when a namespace is None.
            return str(self.namespace) < str(other.namespace)
        return self.name < other.name


def _HasClass(tag, *classes):
    """Return True if `tag` carries at least one of the given CSS classes."""
    return any(c in classes for c in tag.get("class", []))


def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
    """Parse symbol page and retrieve the include header defined in this page.
    The symbol page provides header for the symbol, specifically in
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code><ratio></code> </div>
    </td></tr>

    Args:
      symbol_page_html: HTML text of the symbol page.
      symbol_name: unqualified name of the symbol to look for.
      qual_name: qualified name of the symbol to look for.

    Returns a set of headers. If no declaration row names the symbol, this is
    the set of every header mentioned on the page.
    """
    headers = set()
    all_headers = set()

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select("table.t-dcl-begin, table.t-dsc-begin"):
        current_headers = []
        was_decl = False
        for row in table.select("tr"):
            if _HasClass(row, "t-dcl", "t-dsc"):
                was_decl = True
                # Symbols are in the first cell.
                first_cell = row.find("td")
                if first_cell is None:
                    # find() returns None for a row without any <td>; such a
                    # row cannot name the symbol, so skip it instead of
                    # failing on the attribute access.
                    continue
                found_symbols = first_cell.stripped_strings
                if not any(
                    sym == symbol_name or sym == qual_name for sym in found_symbols
                ):
                    continue
                headers.update(current_headers)
            elif _HasClass(row, "t-dsc-header"):
                # If we saw a decl since the last header, this is a new block of headers
                # for a new block of decls.
                if was_decl:
                    current_headers = []
                was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers


def _ParseSymbolVariant(caption):
    """Extract the variant annotation that follows a symbol link, if any.

    `caption` is the node immediately after the symbol's <a> element on the
    index page. Two forms are recognized: plain text containing a complete
    parenthesized note, e.g. " (algorithm)", and an opening parenthesis
    followed by a <code> element and a closing parenthesis, e.g.
    " (<code>std::complex</code>)".

    Returns the variant text, or None if no variant is recognized.
    """
    if not (isinstance(caption, NavigableString) and "(" in caption):
        return None

    if ")" in caption.text:  # (locale), (algorithm), etc.
        return caption.text.strip(" ()")

    second_part = caption.next_sibling
    if isinstance(second_part, Tag) and second_part.name == "code":
        # (<code>std::complex</code>), etc.
        third_part = second_part.next_sibling
        if isinstance(third_part, NavigableString) and third_part.text.startswith(")"):
            return second_part.text
    return None


def _ParseIndexPage(index_page_html):
    """Parse index page.
    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
    """
    symbols = []
    soup = BeautifulSoup(index_page_html, "html.parser")
    for symbol_href in soup.select("a[title]"):
        # Annotated symbols like "acos<>() (std::complex)" tend to be overloads,
        # and the primary is more useful. The annotation is reported as the
        # variant so that the caller can decide whether to keep the symbol.
        # This accidentally accepts begin/end despite the (iterator) caption: the
        # (since C++11) note is first. They are good symbols, so the bug is unfixed.
        caption = symbol_href.next_sibling
        variant = _ParseSymbolVariant(caption)
        symbol_tt = symbol_href.find("tt")
        if symbol_tt:
            symbols.append(
                (
                    symbol_tt.text.rstrip("<>()"),  # strip any trailing <>()
                    symbol_href["href"],
                    variant,
                )
            )
    return symbols


def _ReadSymbolPage(path, name, qual_name):
    """Read the symbol page at `path` (UTF-8) and return its parsed headers.

    Called through pool.apply_async from _GetSymbols.
    """
    with open(path, encoding="utf-8") as f:
        return _ParseSymbolPage(f.read(), name, qual_name)


def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: multiprocessing pool used to parse the symbol pages in parallel.
      root_dir: directory containing the index page; symbol page hrefs are
        resolved relative to it.
      index_page_name: file name of the index page inside root_dir.
      namespace: namespace prefix of the symbols (e.g. "std::"), or None.
      variants_to_accept: dict mapping a qualified symbol name to a tuple of
        variant names that should be kept for that symbol.

    Returns a list of Symbols.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    with open(index_page_path, "r", encoding="utf-8") as f:
        # Read each symbol page in parallel.
        results = []  # (symbol_name, promise of [header...])
        for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()):
            # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
            # FIXME: use these as a fallback rather than ignoring entirely.
            qualified_symbol_name = (namespace or "") + symbol_name
            variants_for_symbol = variants_to_accept.get(qualified_symbol_name, ())
            if variant and variant not in variants_for_symbol:
                continue
            path = os.path.join(root_dir, symbol_page_path)
            if os.path.isfile(path):
                results.append(
                    (
                        symbol_name,
                        pool.apply_async(
                            _ReadSymbolPage, (path, symbol_name, qualified_symbol_name)
                        ),
                    )
                )
            else:
                sys.stderr.write(
                    "Discarding information for symbol: %s. Page %s does not exist.\n"
                    % (symbol_name, path)
                )

        # Build map from symbol name to a set of headers.
        symbol_headers = collections.defaultdict(set)
        for symbol_name, lazy_headers in results:
            symbol_headers[symbol_name].update(lazy_headers.get())

    symbols = []
    for name, headers in sorted(symbol_headers.items(), key=lambda t: t[0]):
        symbols.append(Symbol(name, namespace, list(headers)))
    return symbols


def signal_ignore_initializer():
    """Pool worker initializer: make the worker process ignore SIGINT."""
    return signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a sorted list of Symbols.
    """
    # By default we prefer the non-variant versions, as they're more common. But
    # there are some symbols, whose variant is more common. This list describes
    # those symbols.
    # Each value must be a real tuple (note the trailing comma): a bare
    # parenthesized string would turn the membership test in _GetSymbols into a
    # substring match.
    variants_to_accept = {
        # std::remove<> has variant algorithm.
        "std::remove": ("algorithm",),
        # These functions don't have a generic version, and all variants are defined in <chrono>
        "std::chrono::abs": ("std::chrono::duration",),
        "std::chrono::ceil": ("std::chrono::duration",),
        "std::chrono::floor": ("std::chrono::duration",),
        "std::chrono::from_stream": ("std::chrono::day",),
        "std::chrono::round": ("std::chrono::duration",),
        # Same, but in <filesystem>
        "std::filesystem::begin": ("std::filesystem::directory_iterator",),
        "std::filesystem::end": ("std::filesystem::directory_iterator",),
        "std::ranges::get": ("std::ranges::subrange",),
    }
    symbols = []
    # Run many workers to process individual symbol pages under the symbol index.
    # Don't allow workers to capture Ctrl-C.
    pool = multiprocessing.Pool(initializer=signal_ignore_initializer)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(
                _GetSymbols(pool, root_dir, page_name, namespace, variants_to_accept)
            )
    finally:
        pool.terminate()
        pool.join()
    return sorted(symbols)