xref: /llvm-project/clang/tools/include-mapping/gen_std.py (revision e8182029516dae445f21db304953aa5f10880d2d)
1#!/usr/bin/env python3
2# ===- gen_std.py -  ------------------------------------------*- python -*--===#
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8# ===------------------------------------------------------------------------===#
9
10"""gen_std.py is a tool to generate a lookup table (from qualified names to
11include headers) for C/C++ Standard Library symbols by parsing archived HTML
12files from cppreference.
13
14The generated files are located in clang/include/Tooling/Inclusions.
15
16Caveats and FIXMEs:
  - symbols in "std" and in the std sub-namespaces listed in main() (e.g.
    chrono) are added; std literal operators and std::placeholders symbols
    are not.
19  - symbols with multiple variants or defined in multiple headers aren't added,
20    e.g. std::move, std::swap
21
22Usage:
23  1. Install BeautifulSoup dependency, see instruction:
24       https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup
25  2. Download cppreference offline HTML files (html_book_20220730.zip in Unofficial Release) at
26       https://en.cppreference.com/w/Cppreference:Archives
27  3. Unzip the zip file from step 2 (e.g., to a "cppreference" directory). You should
28     get a "cppreference/reference" directory.
29  4. Run the command:
30       // Generate C++ symbols
31       python3 gen_std.py -cppreference cppreference/reference -symbols=cpp > StdSymbolMap.inc
32       // Generate C symbols
33       python3 gen_std.py -cppreference cppreference/reference -symbols=c > CSymbolMap.inc
34"""
35
36
37import cppreference_parser
38import argparse
39import datetime
40import os
41import sys
42import re
43
44
# Banner printed at the top of the generated .inc file. It has two %s
# placeholders which main() fills with, in order, the upper-cased -symbols
# value and the modification date of the cppreference index page.
CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// This file was generated automatically by
// clang/tools/include-mapping/gen_std.py, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""
57
58
def ParseArg():
    """Parse the command-line arguments of gen_std.py.

    Returns:
        The argparse namespace with two attributes:
          - cppreference: path to the cppreference offline HTML directory.
          - symbols: which symbol set to generate, either "cpp" or "c".

    Both options are mandatory. argparse prints a usage error and exits the
    process when an option is missing or -symbols has an unsupported value.
    """
    parser = argparse.ArgumentParser(description="Generate StdGen file")
    parser.add_argument(
        "-cppreference",
        metavar="PATH",
        default="",
        help="path to the cppreference offline HTML directory",
        required=True,
    )
    parser.add_argument(
        "-symbols",
        default="cpp",
        # main() only knows how to handle these two values; reject anything
        # else up front with a proper usage error.
        choices=("cpp", "c"),
        help="Generate c or cpp symbols (cpp includes removed names). "
        "One of {cpp, c}.",
        required=True,
    )
    return parser.parse_args()
75
76
def AdditionalHeadersForIOSymbols(symbol):
    """Return extra headers that also provide the given IO-related symbol.

    Args:
        symbol: an object with a `name` and a `headers` list; `headers` must
            hold exactly one entry (checked with an assert).

    Returns:
        A new list with zero, one or two headers. "<iostream>" is listed
        first (when applicable) so that it is preferred over "<iosfwd>".
    """
    # Names declared by the <iosfwd> header, per C++ [iosfwd.syn 31.3.1].
    forward_declared = (
        # Class templates.
        "basic_ios", "basic_streambuf", "basic_istream", "basic_ostream",
        "basic_iostream", "basic_stringbuf", "basic_istringstream",
        "basic_ostringstream", "basic_stringstream", "basic_spanbuf",
        "basic_ispanstream", "basic_ospanstream", "basic_spanstream",
        "basic_filebuf", "basic_ifstream", "basic_ofstream", "basic_fstream",
        "basic_syncbuf", "basic_osyncstream",
        "istreambuf_iterator", "ostreambuf_iterator",
        # char typedefs.
        "ios", "wios", "streambuf", "istream", "ostream", "iostream",
        "stringbuf", "istringstream", "ostringstream", "stringstream",
        "spanbuf", "ispanstream", "ospanstream", "spanstream",
        "filebuf", "ifstream", "ofstream", "fstream",
        "syncbuf", "osyncstream",
        # wchar_t typedefs.
        "wstreambuf", "wistream", "wostream", "wiostream",
        "wstringbuf", "wistringstream", "wostringstream", "wstringstream",
        "wspanbuf", "wispanstream", "wospanstream", "wspanstream",
        "wfilebuf", "wifstream", "wofstream", "wfstream",
        "wsyncbuf", "wosyncstream",
        # Stream position types.
        "fpos", "streampos", "wstreampos",
        "u8streampos", "u16streampos", "u32streampos",
    )
    # <iostream> is an alternative of these headers, per C++
    # [iostream.syn 31.4.1].
    covered_by_iostream = ("<ios>", "<istream>", "<ostream>", "<streambuf>")

    assert len(symbol.headers) == 1
    declaring_header = symbol.headers[0]

    # Order matters: <iostream> is preferred over <iosfwd>.
    candidates = (
        ("<iostream>", declaring_header in covered_by_iostream),
        ("<iosfwd>", symbol.name in forward_declared),
    )
    return [extra for extra, applies in candidates if applies]
161
162
def GetCCompatibilitySymbols(symbol):
    """Return the global-namespace variants of a symbol from a <cname> header.

    Args:
        symbol: an object with `name`, `namespace` and a `headers` list that
            holds exactly one entry (checked with an assert).

    Returns:
        An empty list when the symbol's header is not a C++ form of a C
        standard header, or when the symbol is one of the names the standard
        excludes from the global namespace. Otherwise a list of new
        cppreference_parser.Symbol objects in the global namespace
        (namespace None): one for the <cname> header (only if the input
        symbol has a namespace) and one for the matching <name.h> header.
    """
    # C++ form of the C standard headers.
    c_compat_headers = {
        "<cassert>",
        "<cctype>",
        "<cerrno>",
        "<cfenv>",
        "<cfloat>",
        "<cinttypes>",
        "<climits>",
        "<clocale>",
        "<cmath>",
        "<csetjmp>",
        "<csignal>",
        "<cstdarg>",
        "<cstddef>",
        "<cstdint>",
        "<cstdio>",
        "<cstdlib>",
        "<cstring>",
        "<ctime>",
        "<cuchar>",
        "<cwchar>",
        "<cwctype>",
    }
    # C++ [support.c.headers.other] 17.14.7
    #    ..., behaves as if each name placed in the standard library namespace by
    #    the corresponding <cname> header is placed within the global namespace
    #    scope, except for the functions described in [sf.cmath], the
    #    std::lerp function overloads ([c.math.lerp]), the declaration of
    #    std::byte ([cstddef.syn]), and the functions and function templates
    #    described in [support.types.byteops].
    #
    # Each entry is a regular expression that must match the whole symbol
    # name; the optional trailing "f"/"l" covers the float / long double
    # variants of the special math functions.
    exception_symbols = (
        r"(assoc_)?laguerre[fl]?",
        r"(assoc_|sph_)?legendre[fl]?",
        r"beta[fl]?",
        r"(comp_)?ellint_[1-3][fl]?",
        r"(cyl_|sph_)?bessel_[i-k][fl]?",
        # sph_bessel has no "_i/_j/_k" suffix, so it needs its own pattern.
        r"sph_bessel[fl]?",
        r"(cyl_|sph_)?neumann[fl]?",
        r"expint[fl]?",
        r"hermite[fl]?",
        r"riemann_zeta[fl]?",
        r"lerp",
        r"byte",
        # The named function template of [support.types.byteops].
        r"to_integer",
    )
    assert len(symbol.headers) == 1
    header = symbol.headers[0]
    if header not in c_compat_headers:
        return []
    if any(re.fullmatch(pattern, symbol.name) for pattern in exception_symbols):
        return []

    # Introduce two more entries, both in the global namespace, one using the
    # C++-compat header and another using the C header.
    results = []
    if symbol.namespace is not None:
        # A symbol without a namespace (e.g. a C macro) is already a global
        # entry for this header; adding it again would print a duplicate.
        results.append(cppreference_parser.Symbol(symbol.name, None, [header]))
    c_header = "<" + header[2:-1] + ".h>"  # <cstdio> => <stdio.h>
    results.append(cppreference_parser.Symbol(symbol.name, None, [c_header]))
    return results
224
225
def main():
    """Generate the symbol table and print it to stdout.

    Parses the command line, collects the symbols from the cppreference
    index pages selected by -symbols, and prints the generated-file banner
    followed by one SYMBOL(name, namespace, header) line per symbol/header
    pair. Symbols with no header or with more than one header are skipped
    and reported on stderr.

    Exits with an error message when -symbols has an unsupported value or
    the expected cppreference directory does not exist.
    """
    args = ParseArg()
    if args.symbols == "cpp":
        page_root = os.path.join(args.cppreference, "en", "cpp")
        symbol_index_root = os.path.join(page_root, "symbol_index")
        parse_pages = [
            (page_root, "symbol_index.html", "std::"),
            # std sub-namespace symbols have separated pages.
            # We don't index std literal operators (e.g.
            # std::literals::chrono_literals::operator""d), these symbols can't be
            # accessed by std::<symbol_name>.
            #
            # std::placeholders symbols are handled manually in StdSpecialSymbolMap.inc
            (symbol_index_root, "chrono.html", "std::chrono::"),
            (symbol_index_root, "execution.html", "std::execution::"),
            (symbol_index_root, "numbers.html", "std::numbers::"),
            (symbol_index_root, "filesystem.html", "std::filesystem::"),
            (symbol_index_root, "pmr.html", "std::pmr::"),
            (symbol_index_root, "ranges.html", "std::ranges::"),
            (symbol_index_root, "views.html", "std::ranges::views::"),
            # std::ranges::views can be accessed as std::views.
            (symbol_index_root, "views.html", "std::views::"),
            (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
            (symbol_index_root, "this_thread.html", "std::this_thread::"),
            # Zombie symbols that were available from the Standard Library, but are
            # removed in the following standards.
            (symbol_index_root, "zombie_names.html", "std::"),
            (symbol_index_root, "macro.html", None),
        ]
    elif args.symbols == "c":
        page_root = os.path.join(args.cppreference, "en", "c")
        symbol_index_root = page_root
        parse_pages = [(page_root, "index.html", None)]
    else:
        # Without this branch the variables above would be unbound and the
        # script would die with an UnboundLocalError further down.
        sys.exit("Unsupported -symbols value: %s" % args.symbols)

    if not os.path.exists(symbol_index_root):
        sys.exit("Path %s doesn't exist!" % symbol_index_root)

    symbols = cppreference_parser.GetSymbols(parse_pages)

    # We don't have version information from the unzipped offline HTML files,
    # so we use the modified time of the index.html page under page_root as
    # the version.
    index_page_path = os.path.join(page_root, "index.html")
    cppreference_modified_date = datetime.datetime.fromtimestamp(
        os.stat(index_page_path).st_mtime
    ).strftime("%Y-%m-%d")
    print(CODE_PREFIX % (args.symbols.upper(), cppreference_modified_date))
    for symbol in symbols:
        if len(symbol.headers) == 1:
            # The compatibility variants must be computed before the extra IO
            # headers are appended, while the symbol still has one header.
            augmented_symbols = [symbol]
            augmented_symbols.extend(GetCCompatibilitySymbols(symbol))
            for s in augmented_symbols:
                s.headers.extend(AdditionalHeadersForIOSymbols(s))
                for header in s.headers:
                    # SYMBOL(unqualified_name, namespace, header)
                    print("SYMBOL(%s, %s, %s)" % (s.name, s.namespace, header))
        elif len(symbol.headers) == 0:
            sys.stderr.write("No header found for symbol %s\n" % symbol.name)
        else:
            # FIXME: support symbols with multiple headers (e.g. std::move).
            sys.stderr.write(
                "Ambiguous header for symbol %s: %s\n"
                % (symbol.name, ", ".join(symbol.headers))
            )
291
292
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
295