#!/usr/bin/env python
#===- gen_std.py - ------------------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#

"""gen_std.py is a tool to generate a lookup table (from qualified names to
include headers) for C/C++ Standard Library symbols by parsing archived HTML
files from cppreference.

The generated files are located in clang/include/Tooling/Inclusions.

Caveats and FIXMEs:
  - std literal operators (e.g. std::literals::chrono_literals::operator""d)
    are not indexed, these symbols can't be accessed by std::<symbol_name>.
  - std::placeholders symbols are not indexed yet, placeholders.html needs
    special handling.
  - symbols with multiple variants or defined in multiple headers aren't added,
    e.g. std::move, std::swap

Usage:
  1. Install BeautifulSoup dependency, see instruction:
       https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup
  2. Download cppreference offline HTML files (e.g. html_book_20181028.zip) at
       https://en.cppreference.com/w/Cppreference:Archives
  3. Unzip the zip file from step 2 (e.g., to a "cppreference" directory). You should
     get a "cppreference/reference" directory.
  4. Run the command:
       // Generate C++ symbols
       python3 gen_std.py -cppreference cppreference/reference -symbols=cpp > StdSymbolMap.inc
       // Generate C++ removed symbols
       python3 gen_std.py -cppreference cppreference/reference -symbols=cpp_removed > RemovedSymbolMap.inc
       // Generate C symbols
       python3 gen_std.py -cppreference cppreference/reference -symbols=c > CSymbolMap.inc
"""


import argparse
import datetime
import os
import sys

CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// This file was generated automatically by
// clang/tools/include-mapping/gen_std.py, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""

# The only values of -symbols that main() knows how to handle.
SYMBOL_KINDS = ('cpp', 'c', 'cpp_removed')


def ParseArg(argv=None):
    """Parse the command line and return the argparse namespace.

    argv is the list of arguments to parse (without the program name); when it
    is None, argparse falls back to sys.argv[1:].

    Both -cppreference and -symbols are mandatory. -symbols is restricted to
    SYMBOL_KINDS, so an unsupported value is reported as a usage error
    (SystemExit with status 2) instead of failing later inside main().
    """
    parser = argparse.ArgumentParser(description='Generate StdGen file')
    parser.add_argument('-cppreference', metavar='PATH',
                        help='path to the cppreference offline HTML directory',
                        required=True)
    parser.add_argument('-symbols',
                        choices=SYMBOL_KINDS,
                        help='Generate c or cpp (removed) symbols. One of {cpp, c, cpp_removed}.',
                        required=True)
    return parser.parse_args(argv)


def main(argv=None):
    """Print the SYMBOL(...) table for the requested symbol kind to stdout.

    argv is forwarded to ParseArg(); None means "use the process arguments".

    Symbols with exactly one header are printed; symbols with no header or
    with several candidate headers are reported on stderr and skipped.
    Exits with an error message if the expected cppreference directory for
    the requested symbol kind does not exist.
    """
    args = ParseArg(argv)
    if args.symbols == 'cpp':
        page_root = os.path.join(args.cppreference, "en", "cpp")
        symbol_index_root = os.path.join(page_root, "symbol_index")
        parse_pages = [
            (page_root, "symbol_index.html", "std::"),
            # std sub-namespace symbols have separated pages.
            # We don't index std literal operators (e.g.
            # std::literals::chrono_literals::operator""d), these symbols can't be
            # accessed by std::<symbol_name>.
            # FIXME: index std::placeholders symbols, placeholders.html page is
            # different (which contains one entry for _1, _2, ..., _N), we need special
            # handling.
            (symbol_index_root, "chrono.html", "std::chrono::"),
            (symbol_index_root, "filesystem.html", "std::filesystem::"),
            (symbol_index_root, "pmr.html", "std::pmr::"),
            (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
            (symbol_index_root, "this_thread.html", "std::this_thread::"),
        ]
    elif args.symbols == 'cpp_removed':
        page_root = os.path.join(args.cppreference, "en", "cpp")
        symbol_index_root = os.path.join(page_root, "symbol_index")
        parse_pages = [(symbol_index_root, "zombie_names.html", "std::")]
    elif args.symbols == 'c':
        page_root = os.path.join(args.cppreference, "en", "c")
        symbol_index_root = page_root
        parse_pages = [(page_root, "index.html", None)]
    else:
        # ParseArg() already rejects other values; this guards against the
        # branches above and SYMBOL_KINDS drifting apart.
        sys.exit("Unsupported -symbols value: %s" % args.symbols)

    if not os.path.exists(symbol_index_root):
        sys.exit("Path %s doesn't exist!" % symbol_index_root)

    # Imported here rather than at module level so that -h and argument/path
    # errors are reported without needing the parser module to be importable.
    import cppreference_parser
    symbols = cppreference_parser.GetSymbols(parse_pages)

    # We don't have version information from the unzipped offline HTML files.
    # so we use the modified time of the index.html under page_root as the
    # version.
    index_page_path = os.path.join(page_root, "index.html")
    cppreference_modified_date = datetime.datetime.fromtimestamp(
        os.stat(index_page_path).st_mtime).strftime('%Y-%m-%d')
    print(CODE_PREFIX % (args.symbols.upper(), cppreference_modified_date))
    for symbol in symbols:
        headers = symbol.headers
        if len(headers) == 1:
            # SYMBOL(unqualified_name, namespace, header)
            print("SYMBOL(%s, %s, %s)" % (symbol.name, symbol.namespace,
                                          headers[0]))
        elif len(headers) == 0:
            sys.stderr.write("No header found for symbol %s\n" % symbol.name)
        else:
            # FIXME: support symbols with multiple headers (e.g. std::move).
            sys.stderr.write("Ambiguous header for symbol %s: %s\n" % (
                symbol.name, ', '.join(headers)))


if __name__ == '__main__':
    main()