#!/usr/bin/env python3
# ===- cppreference_parser.py - ------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===------------------------------------------------------------------------===#

from bs4 import BeautifulSoup, NavigableString, Tag

import collections
import multiprocessing
import os
import re
import signal
import sys


class Symbol:
    """A symbol found in the index, with the headers that provide it."""

    def __init__(self, name, namespace, headers):
        # unqualified symbol name, e.g. "move"
        self.name = name
        # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope)
        # None for C symbols.
        self.namespace = namespace
        # a list of corresponding headers
        self.headers = headers

    def __lt__(self, other):
        # Order by namespace first, then by name. The namespace may be None
        # (C symbols), so it is compared through str().
        if self.namespace != other.namespace:
            return str(self.namespace) < str(other.namespace)
        return self.name < other.name


def _HasClass(tag, *classes):
    """Return True if `tag` carries at least one of the given CSS classes."""
    return any(c in classes for c in tag.get("class", []))


def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
    """Parse symbol page and retrieve the include header defined in this page.
    The symbol page provides header for the symbol, specifically in
    "Defined in header <header>" section. An example:

    <tr class="t-dsc-header">
      <td colspan="2"> <div>Defined in header <code><ratio></code> </div>
    </td></tr>

    Returns a set of headers. If no declaration row names the symbol (by
    either `symbol_name` or `qual_name`), every header mentioned on the page
    is returned instead.
    """
    headers = set()
    all_headers = set()

    soup = BeautifulSoup(symbol_page_html, "html.parser")
    # Rows in table are like:
    #   Defined in header <foo>      .t-dsc-header
    #   Defined in header <bar>      .t-dsc-header
    #   decl1                        .t-dcl
    #   Defined in header <baz>      .t-dsc-header
    #   decl2                        .t-dcl
    for table in soup.select("table.t-dcl-begin, table.t-dsc-begin"):
        current_headers = []
        was_decl = False
        for row in table.select("tr"):
            if _HasClass(row, "t-dcl", "t-dsc"):
                was_decl = True
                # Symbols are in the first cell.
                first_cell = row.find("td")
                if first_cell is None:
                    # A declaration row without any cell names no symbol.
                    continue
                if any(
                    sym in (symbol_name, qual_name)
                    for sym in first_cell.stripped_strings
                ):
                    headers.update(current_headers)
            elif _HasClass(row, "t-dsc-header"):
                # If we saw a decl since the last header, this is a new block of
                # headers for a new block of decls.
                if was_decl:
                    current_headers = []
                    was_decl = False
                # There are also .t-dsc-header for "defined in namespace".
                if "Defined in header " not in row.text:
                    continue
                # The interesting header content (e.g. <cstdlib>) is wrapped in
                # <code>.
                for header_code in row.find_all("code"):
                    current_headers.append(header_code.text)
                    all_headers.add(header_code.text)
    # If the symbol was never named, consider all named headers.
    return headers or all_headers


def _ParseSymbolVariant(caption):
    """Extract the variant annotation that follows a symbol link, if any.

    `caption` is the node right after the symbol's <a> element in the index.
    Returns the text inside the parentheses (e.g. "locale", "std::complex"),
    or None if the node is not a recognised "(...)" annotation.
    """
    if not (isinstance(caption, NavigableString) and "(" in caption):
        return None

    if ")" in caption.text:  # (locale), (algorithm), etc.
        return caption.text.strip(" ()")

    second_part = caption.next_sibling
    if isinstance(second_part, Tag) and second_part.name == "code":
        # (<code>std::complex</code>), etc.
        third_part = second_part.next_sibling
        if isinstance(third_part, NavigableString) and third_part.text.startswith(")"):
            return second_part.text
    return None


def _ParseIndexPage(index_page_html):
    """Parse index page.
    The index page lists all std symbols and hrefs to their detailed pages
    (which contain the defined header). An example:

    <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
    <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

    Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
    """
    symbols = []
    soup = BeautifulSoup(index_page_html, "html.parser")
    for symbol_href in soup.select("a[title]"):
        # Annotated symbols like "acos<>() (std::complex)" are reported with
        # their variant so the caller can ignore them: these tend to be
        # overloads, and the primary is more useful.
        # This accidentally accepts begin/end despite the (iterator) caption: the
        # (since C++11) note is first. They are good symbols, so the bug is unfixed.
        caption = symbol_href.next_sibling
        variant = _ParseSymbolVariant(caption)
        symbol_tt = symbol_href.find("tt")
        if symbol_tt:
            symbols.append(
                (
                    symbol_tt.text.rstrip("<>()"),  # strip any trailing <>()
                    symbol_href["href"],
                    variant,
                )
            )
    return symbols


def _ReadSymbolPage(path, name, qual_name):
    """Read the symbol page at `path` and return the set of headers it names."""
    with open(path, encoding="utf-8") as f:
        return _ParseSymbolPage(f.read(), name, qual_name)


def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
    """Get all symbols listed in the index page. All symbols should be in the
    given namespace.

    Args:
      pool: a multiprocessing pool used to parse symbol pages in parallel.
      root_dir: directory containing the index page and the symbol pages.
      index_page_name: path of the index page, relative to root_dir.
      namespace: namespace prefix of the symbols (e.g. "std::"), or None.
      variants_to_accept: map from qualified symbol name to the collection of
        variant captions that should be accepted for that symbol.

    Returns a list of Symbols.
    """

    # Workflow steps:
    #   1. Parse index page which lists all symbols to get symbol
    #      name (unqualified name) and its href link to the symbol page which
    #      contains the defined header.
    #   2. Parse the symbol page to get the defined header.
    index_page_path = os.path.join(root_dir, index_page_name)
    with open(index_page_path, "r", encoding="utf-8") as f:
        index_page_html = f.read()

    # Read each symbol page in parallel.
    results = []  # (symbol_name, promise of [header...])
    for symbol_name, symbol_page_path, variant in _ParseIndexPage(index_page_html):
        # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
        # FIXME: use these as a fallback rather than ignoring entirely.
        qualified_symbol_name = (namespace or "") + symbol_name
        variants_for_symbol = variants_to_accept.get(qualified_symbol_name, ())
        if variant and variant not in variants_for_symbol:
            continue
        path = os.path.join(root_dir, symbol_page_path)
        if os.path.isfile(path):
            results.append(
                (
                    symbol_name,
                    pool.apply_async(
                        _ReadSymbolPage, (path, symbol_name, qualified_symbol_name)
                    ),
                )
            )
        else:
            sys.stderr.write(
                "Discarding information for symbol: %s. Page %s does not exist.\n"
                % (symbol_name, path)
            )

    # Build map from symbol name to a set of headers.
    symbol_headers = collections.defaultdict(set)
    for symbol_name, lazy_headers in results:
        symbol_headers[symbol_name].update(lazy_headers.get())

    symbols = []
    for name, headers in sorted(symbol_headers.items(), key=lambda t: t[0]):
        # Sort the headers so the output does not depend on set iteration
        # order, which varies between runs under string-hash randomization.
        symbols.append(Symbol(name, namespace, sorted(headers)))
    return symbols


def signal_ignore_initializer():
    """Pool-worker initializer: make the worker ignore SIGINT (Ctrl-C)."""
    return signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
    """Get all symbols by parsing the given pages.

    Args:
      parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

    Returns a sorted list of Symbols.
    """
    # By default we prefer the non-variant versions, as they're more common. But
    # there are some symbols, whose variant is more common. This list describes
    # those symbols.
    # NOTE: each value must be a real tuple (mind the trailing comma). A bare
    # parenthesized string would turn the `in` membership test into a
    # substring match.
    variants_to_accept = {
        # std::remove<> has variant algorithm.
        "std::remove": ("algorithm",),
        # These functions don't have a generic version, and all variants are
        # defined in <chrono>
        "std::chrono::abs": ("std::chrono::duration",),
        "std::chrono::ceil": ("std::chrono::duration",),
        "std::chrono::floor": ("std::chrono::duration",),
        "std::chrono::from_stream": ("std::chrono::day",),
        "std::chrono::round": ("std::chrono::duration",),
        # Same, but in <filesystem>
        "std::filesystem::begin": ("std::filesystem::directory_iterator",),
        "std::filesystem::end": ("std::filesystem::directory_iterator",),
        "std::ranges::get": ("std::ranges::subrange",),
    }
    symbols = []
    # Run many workers to process individual symbol pages under the symbol index.
    # Don't allow workers to capture Ctrl-C.
    pool = multiprocessing.Pool(initializer=signal_ignore_initializer)
    try:
        for root_dir, page_name, namespace in parse_pages:
            symbols.extend(
                _GetSymbols(pool, root_dir, page_name, namespace, variants_to_accept)
            )
    finally:
        pool.terminate()
        pool.join()
    return sorted(symbols)