#!/usr/bin/env python3
# ===- gen_std.py - ------------------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===------------------------------------------------------------------------===#

"""gen_std.py is a tool to generate a lookup table (from qualified names to
include headers) for C/C++ Standard Library symbols by parsing archived HTML
files from cppreference.

The generated files are located in clang/include/Tooling/Inclusions.

Caveats and FIXMEs:
  - std literal operators (e.g. those in std::literals::chrono_literals) are
    not indexed, because they can't be accessed as std::<symbol_name>.
  - symbols with multiple variants or defined in multiple headers aren't added,
    e.g. std::move, std::swap

Usage:
  1. Install BeautifulSoup dependency, see instruction:
       https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup
  2. Download cppreference offline HTML files (html_book_20220730.zip in
     Unofficial Release) at
       https://en.cppreference.com/w/Cppreference:Archives
  3. Unzip the zip file from step 2 (e.g., to a "cppreference" directory). You
     should get a "cppreference/reference" directory.
  4. Run the command:
       // Generate C++ symbols
       python3 gen_std.py -cppreference cppreference/reference -symbols=cpp > StdSymbolMap.inc
       // Generate C symbols
       python3 gen_std.py -cppreference cppreference/reference -symbols=c > CSymbolMap.inc
"""


import cppreference_parser
import argparse
import datetime
import os
import sys
import re


CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// This file was generated automatically by
// clang/tools/include-mapping/gen_std.py, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""

# Values of -symbols that main() knows how to handle.
_SUPPORTED_SYMBOL_SETS = ("cpp", "c")

# IO-related symbols declared in the <iosfwd> header, per C++
# [iosfwd.syn 31.3.1].
_IOSFWD_SYMBOLS = frozenset(
    [
        "basic_ios",
        "basic_streambuf",
        "basic_istream",
        "basic_ostream",
        "basic_iostream",
        "basic_stringbuf",
        "basic_istringstream",
        "basic_ostringstream",
        "basic_stringstream",
        "basic_spanbuf",
        "basic_ispanstream",
        "basic_ospanstream",
        "basic_spanstream",
        "basic_filebuf",
        "basic_ifstream",
        "basic_ofstream",
        "basic_fstream",
        "basic_syncbuf",
        "basic_osyncstream",
        "istreambuf_iterator",
        "ostreambuf_iterator",
        "ios",
        "wios",
        "streambuf",
        "istream",
        "ostream",
        "iostream",
        "stringbuf",
        "istringstream",
        "ostringstream",
        "stringstream",
        "spanbuf",
        "ispanstream",
        "ospanstream",
        "spanstream",
        "filebuf",
        "ifstream",
        "ofstream",
        "fstream",
        "syncbuf",
        "osyncstream",
        "wstreambuf",
        "wistream",
        "wostream",
        "wiostream",
        "wstringbuf",
        "wistringstream",
        "wostringstream",
        "wstringstream",
        "wspanbuf",
        "wispanstream",
        "wospanstream",
        "wspanstream",
        "wfilebuf",
        "wifstream",
        "wofstream",
        "wfstream",
        "wsyncbuf",
        "wosyncstream",
        "fpos",
        "streampos",
        "wstreampos",
        "u8streampos",
        "u16streampos",
        "u32streampos",
    ]
)

# Headers for which <iostream> is an alternative, per C++
# [iostream.syn 31.4.1].
_IOSTREAM_PROVIDED_HEADERS = frozenset(
    ["<ios>", "<istream>", "<ostream>", "<streambuf>"]
)

# C++ form of the C standard headers.
_C_COMPAT_HEADERS = frozenset(
    [
        "<cassert>",
        "<cctype>",
        "<cerrno>",
        "<cfenv>",
        "<cfloat>",
        "<cinttypes>",
        "<climits>",
        "<clocale>",
        "<cmath>",
        "<csetjmp>",
        "<csignal>",
        "<cstdarg>",
        "<cstddef>",
        "<cstdint>",
        "<cstdio>",
        "<cstdlib>",
        "<cstring>",
        "<ctime>",
        "<cuchar>",
        "<cwchar>",
        "<cwctype>",
    ]
)

# C++ [support.c.headers.other] 17.14.7
#    ..., behaves as if each name placed in the standard library namespace by
#    the corresponding <cname> header is placed within the global namespace
#    scope, except for the functions described in [sf.cmath], the
#    std::lerp function overloads ([c.math.lerp]), the declaration of
#    std::byte ([cstddef.syn]), and the functions and function templates
#    described in [support.types.byteops].
#
# Each pattern must match the whole symbol name. The optional suffix is the
# character class [fl] (float / long double variants); a '|' inside the class
# would be matched literally, so it must not appear there.
_C_COMPAT_EXCEPTION_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        "(assoc_)?laguerre[fl]?",
        "(assoc_|sph_)?legendre[fl]?",
        "beta[fl]?",
        "(comp_)?ellint_[1-3][fl]?",
        "(cyl_|sph_)?bessel_[i-k][fl]?",
        "(cyl_|sph_)?neumann[fl]?",
        "expint[fl]?",
        "hermite[fl]?",
        "riemann_zeta[fl]?",
        "lerp",
        "byte",
    )
)


def ParseArg():
    """Parse the command line and return the argparse namespace.

    Both options are mandatory:
      -cppreference PATH  path to the cppreference offline HTML directory.
      -symbols {cpp,c}    which symbol set to generate.

    Unsupported -symbols values are rejected by argparse with a usage error
    instead of failing later inside main().
    """
    parser = argparse.ArgumentParser(description="Generate StdGen file")
    parser.add_argument(
        "-cppreference",
        metavar="PATH",
        help="path to the cppreference offline HTML directory",
        required=True,
    )
    parser.add_argument(
        "-symbols",
        choices=_SUPPORTED_SYMBOL_SETS,
        help="Which symbols to generate: cpp (C++ symbols, including removed "
        "ones) or c.",
        required=True,
    )
    return parser.parse_args()


def AdditionalHeadersForIOSymbols(symbol):
    """Return extra headers that also provide the given IO-related symbol.

    `symbol` must have exactly one header. The result is a new list (possibly
    empty) and `symbol` itself is not modified:
      - "<iostream>" if the symbol's header is one of <ios>, <istream>,
        <ostream> or <streambuf>;
      - "<iosfwd>" if the symbol's name is declared in <iosfwd>.
    """
    assert len(symbol.headers) == 1
    sym_header = symbol.headers[0]
    headers = []
    # <iostream> is preferred over <iosfwd>, so it is added first.
    if sym_header in _IOSTREAM_PROVIDED_HEADERS:
        headers.append("<iostream>")

    if symbol.name in _IOSFWD_SYMBOLS:
        headers.append("<iosfwd>")

    return headers


def GetCCompatibilitySymbols(symbol):
    """Return the global-namespace variants of a symbol from a <cname> header.

    `symbol` must have exactly one header. If that header is not a C++ form of
    a C standard header, or the symbol is one of the names that the standard
    excludes from the global namespace, an empty list is returned.

    Otherwise up to two new symbols in the global namespace (namespace None)
    are returned: one using the C++-compat header (only when the input symbol
    is itself namespaced) and one using the corresponding C header.
    """
    assert len(symbol.headers) == 1
    header = symbol.headers[0]
    if header not in _C_COMPAT_HEADERS:
        return []
    if any(p.fullmatch(symbol.name) for p in _C_COMPAT_EXCEPTION_PATTERNS):
        return []

    # Introduce two more entries, both in the global namespace, one using the
    # C++-compat header and another using the C header.
    results = []
    if symbol.namespace is not None:
        # avoid printing duplicated entries, for C macros!
        results.append(cppreference_parser.Symbol(symbol.name, None, [header]))
    c_header = "<" + header[2:-1] + ".h>"  # <cstdio> => <stdio.h>
    results.append(cppreference_parser.Symbol(symbol.name, None, [c_header]))
    return results


def main():
    """Generate the symbol table on stdout; diagnostics go to stderr.

    Exits with an error message if the requested symbol set is unsupported or
    if the symbol index directory does not exist.
    """
    args = ParseArg()
    if args.symbols == "cpp":
        page_root = os.path.join(args.cppreference, "en", "cpp")
        symbol_index_root = os.path.join(page_root, "symbol_index")
        parse_pages = [
            (page_root, "symbol_index.html", "std::"),
            # std sub-namespace symbols have separated pages.
            # We don't index std literal operators (e.g.
            # std::literals::chrono_literals::operator""d), these symbols can't be
            # accessed by std::<symbol_name>.
            #
            # std::placeholders symbols are handled manually in StdSpecialSymbolMap.inc
            (symbol_index_root, "chrono.html", "std::chrono::"),
            (symbol_index_root, "execution.html", "std::execution::"),
            (symbol_index_root, "numbers.html", "std::numbers::"),
            (symbol_index_root, "filesystem.html", "std::filesystem::"),
            (symbol_index_root, "pmr.html", "std::pmr::"),
            (symbol_index_root, "ranges.html", "std::ranges::"),
            (symbol_index_root, "views.html", "std::ranges::views::"),
            # std::ranges::views can be accessed as std::views.
            (symbol_index_root, "views.html", "std::views::"),
            (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
            (symbol_index_root, "this_thread.html", "std::this_thread::"),
            # Zombie symbols that were available from the Standard Library, but are
            # removed in the following standards.
            (symbol_index_root, "zombie_names.html", "std::"),
            (symbol_index_root, "macro.html", None),
        ]
    elif args.symbols == "c":
        page_root = os.path.join(args.cppreference, "en", "c")
        symbol_index_root = page_root
        parse_pages = [(page_root, "index.html", None)]
    else:
        # ParseArg() already restricts the value; this guards against the two
        # getting out of sync, which would otherwise surface as a confusing
        # UnboundLocalError below.
        sys.exit("Unsupported -symbols value: %s" % args.symbols)

    if not os.path.exists(symbol_index_root):
        sys.exit("Path %s doesn't exist!" % symbol_index_root)

    symbols = cppreference_parser.GetSymbols(parse_pages)

    # We don't have version information from the unzipped offline HTML files,
    # so we use the modified time of the page root's index.html as the version.
    index_page_path = os.path.join(page_root, "index.html")
    cppreference_modified_date = datetime.datetime.fromtimestamp(
        os.stat(index_page_path).st_mtime
    ).strftime("%Y-%m-%d")
    print(CODE_PREFIX % (args.symbols.upper(), cppreference_modified_date))
    for symbol in symbols:
        if len(symbol.headers) == 1:
            augmented_symbols = [symbol]
            augmented_symbols.extend(GetCCompatibilitySymbols(symbol))
            for s in augmented_symbols:
                # Build the header list without mutating the parsed symbol.
                headers = list(s.headers) + AdditionalHeadersForIOSymbols(s)
                for header in headers:
                    # SYMBOL(unqualified_name, namespace, header)
                    print("SYMBOL(%s, %s, %s)" % (s.name, s.namespace, header))
        elif len(symbol.headers) == 0:
            sys.stderr.write("No header found for symbol %s\n" % symbol.name)
        else:
            # FIXME: support symbols with multiple headers (e.g. std::move).
            sys.stderr.write(
                "Ambiguous header for symbol %s: %s\n"
                % (symbol.name, ", ".join(symbol.headers))
            )


if __name__ == "__main__":
    main()