xref: /netbsd-src/external/bsd/zstd/dist/build/single_file_libs/combine.py (revision 3117ece4fc4a4ca4489ba793710b60b0d26bab6c)
1*3117ece4Schristos#!/usr/bin/env python3
2*3117ece4Schristos
3*3117ece4Schristos# Tool to bundle multiple C/C++ source files, inlining any includes.
4*3117ece4Schristos#
5*3117ece4Schristos# Note: there are two types of exclusion options: the '-x' flag, which besides
6*3117ece4Schristos# excluding a file also adds an #error directive in place of the #include, and
7*3117ece4Schristos# the '-k' flag, which keeps the #include and doesn't inline the file. The
8*3117ece4Schristos# intended use cases are: '-x' for files that would normally be #if'd out, so
9*3117ece4Schristos# features that 100% won't be used in the amalgamated file, for which every
10*3117ece4Schristos# occurrence adds the error, and '-k' for headers that we wish to manually
11*3117ece4Schristos# include, such as a project's public API, for which occurrences after the first
12*3117ece4Schristos# are removed.
13*3117ece4Schristos#
14*3117ece4Schristos# Todo: the error handling could be better, which currently throws and halts
15*3117ece4Schristos# (which is functional just not very friendly).
16*3117ece4Schristos#
17*3117ece4Schristos# Author: Carl Woffenden, Numfum GmbH (this script is released under a CC0 license/Public Domain)
18*3117ece4Schristos
19*3117ece4Schristosimport argparse, re, sys
20*3117ece4Schristos
21*3117ece4Schristosfrom pathlib import Path
22*3117ece4Schristosfrom typing import Any, List, Optional, Pattern, Set, TextIO
23*3117ece4Schristos
# Set of file roots when searching (equivalent to -I paths for the compiler).
# Filled at startup with the resolved '-r/--root' arguments.
roots: Set[Path] = set()

# Set of (canonical) file Path objects to exclude from inlining (and not only
# exclude but to add a compiler error directive when they're encountered).
# Filled at startup from the '-x/--exclude' arguments.
excludes: Set[Path] = set()

# Set of (canonical) file Path objects to keep as include directives.
# Filled at startup from the '-k/--keep' arguments.
keeps: Set[Path] = set()

# Whether to keep the #pragma once directives (unlikely, since this will result
# in a warning, but the option is there). Set from the '-p/--pragma' flag.
keep_pragma: bool = False

# Destination file object (or stdout if no output file was supplied).
destn: TextIO = sys.stdout

# Set of file Path objects previously inlined (and to ignore if reencountering).
# The input file itself is added up front, and files flagged to keep are added
# on first sight too, so repeat includes of either are skipped.
found: Set[Path] = set()
43*3117ece4Schristos
# Compiled regex Pattern to handle "#pragma once" in various formats:
#
#   #pragma once
#     #pragma once
#   #  pragma once
#   #pragma   once
#   #pragma once // comment
#
# Ignoring commented versions, same as include_regex.
#
# At least one whitespace character is required between 'pragma' and 'once',
# and 'once' must end on a word boundary, so malformed or unrelated directives
# such as '#pragmaonce' or '#pragma once_flag' are not mistaken for the real
# thing (and silently dropped from the output).
#
pragma_regex: Pattern = re.compile(r'^\s*#\s*pragma\s+once\b')
55*3117ece4Schristos
# Compiled regex Pattern to handle the following type of file includes:
#
#   #include "file"
#     #include "file"
#   #  include "file"
#   #include   "file"
#   #include "file" // comment
#   #include "file" // comment with quote "
#
# And all combinations of, as well as ignoring the following:
#
#   #include <file>
#   //#include "file"
#   /*#include "file"*/
#
# Group 1 captures the quoted file name. The capture is non-greedy, so it stops
# at the first closing quote and a stray quote in a trailing comment is not
# swallowed into the name.
#
# We don't try to catch errors since the compiler will do this (and the code is
# expected to be valid before processing) and we don't care what follows the
# file (whether it's a valid comment or not, since anything after the quoted
# string is ignored).
#
include_regex: Pattern = re.compile(r'^\s*#\s*include\s*"(.+?)"')
77*3117ece4Schristos
# Self-check for include_regex: every supported spelling of a quoted include
# must match, angle-bracket and commented-out includes must not, and a quote
# inside a trailing comment must not leak into the captured file name. Prints
# a confirmation and returns True when all checks pass, otherwise False.
#
def test_match_include() -> bool:
    accepted = (
        '#include "file"',
        '  #include "file"',
        '#  include "file"',
        '#include   "file"',
        '#include "file" // comment',
    )
    rejected = (
        '#include <file>',
        '//#include "file"',
        '/*#include "file"*/',
    )
    if not all(include_regex.match(sample) for sample in accepted):
        return False
    if any(include_regex.match(sample) for sample in rejected):
        return False
    # The capture must stop at the first closing quote
    quoted = include_regex.match('#include "file" // "')
    if not quoted or quoted.group(1) != 'file':
        return False
    print('#include match valid')
    return True
94*3117ece4Schristos
# Self-check for pragma_regex: every supported spelling of '#pragma once' must
# match and commented-out versions must not. Prints a confirmation and returns
# True when all checks pass, otherwise False.
#
def test_match_pragma() -> bool:
    accepted = (
        '#pragma once',
        '  #pragma once',
        '#  pragma once',
        '#pragma   once',
        '#pragma once // comment',
    )
    rejected = (
        '//#pragma once',
        '/*#pragma once*/',
    )
    if not all(pragma_regex.match(sample) for sample in accepted):
        return False
    if any(pragma_regex.match(sample) for sample in rejected):
        return False
    print('#pragma once match valid')
    return True
108*3117ece4Schristos
# Finds 'file'. First the 'root' paths are searched, followed by the currently
# processing file's 'parent' path (or the current working directory when no
# 'parent' is given), returning a valid Path in canonical form. If no match is
# found None is returned.
#
# The result is always resolved, including in the no-parent case, because
# callers compare it against the canonical paths held in the 'found',
# 'excludes' and 'keeps' sets.
#
def resolve_include(file: str, parent: Optional[Path] = None) -> Optional[Path]:
    # 'roots' is a set, so its iteration order can vary from run to run.
    # Sorting makes the choice reproducible when the same relative name exists
    # under more than one root (note this is path order, not the order the
    # roots were given on the command line).
    for root in sorted(roots):
        candidate = root.joinpath(file).resolve()
        if candidate.is_file():
            return candidate
    if parent is not None:
        candidate = parent.joinpath(file).resolve()
    else:
        candidate = Path(file).resolve()
    if candidate.is_file():
        return candidate
    return None
125*3117ece4Schristos
# Helper to resolve lists of files. 'file_list' is passed in from the arguments
# (and may be None or empty, in which case nothing happens). Each entry is
# looked up like any include entry, either from the list of root paths or the
# owning file's 'parent', which in this case is the input file's directory.
# Entries that resolve are added to the 'resolved' set; the rest are reported
# on stderr as a warning.
#
def resolve_excluded_files(file_list: Optional[List[str]], resolved: Set[Path], parent: Optional[Path] = None) -> None:
    if not file_list:
        return
    for entry in file_list:
        located = resolve_include(entry, parent)
        if not located:
            error_line(f'Warning: excluded file not found: {entry}')
            continue
        resolved.add(located)
139*3117ece4Schristos
# Emits 'line', followed by a newline, to the current destination 'destn'
# (the output file, or stdout when none was supplied).
#
def write_line(line: str) -> None:
    target = destn
    print(line, file=target)
144*3117ece4Schristos
# Reports 'line' on stderr. Besides errors this carries the general progress
# notifications, keeping stdout clean so the generated source can be piped.
#
def error_line(line: Any) -> None:
    stream = sys.stderr
    print(line, file=stream)
150*3117ece4Schristos
# Inline the contents of 'file' (with any of its includes also inlined, etc.).
# 'file_name' is the name shown in the progress log; it defaults to the file's
# own name.
#
# Each line is either a quoted #include, which is handled as below, or an
# ordinary source line, which is copied through ('#pragma once' lines are
# dropped unless 'keep_pragma' is set). For an include:
#
#   - not resolvable: an #error directive is written in its place
#   - in 'excludes':  an #error directive is written in its place, every time
#   - seen before:    replaced with a 'skipping' comment
#   - in 'keeps':     the original directive is kept (first occurrence only)
#   - otherwise:      the file is recursively inlined between marker comments
#
# Note: text encoding errors are ignored and replaced with ? when reading the
# input files. This isn't ideal, but it's more than likely in the comments than
# code and a) the text editor has probably also failed to read the same content,
# and b) the compiler probably did too.
#
def add_file(file: Path, file_name: str = None) -> None:
    if not file.is_file():
        error_line(f'Error: Invalid file: {file}')
        return
    label = file_name if file_name else file.name
    error_line(f'Processing: {label}')
    with file.open('r', errors='replace') as source:
        for raw in source:
            text = raw.rstrip('\n')
            directive = include_regex.match(text)
            if not directive:
                # Skip any 'pragma once' directives, otherwise write the source line
                if keep_pragma or not pragma_regex.match(text):
                    write_line(text)
                continue
            # We have a quoted include directive so grab the file
            inc_name = directive.group(1)
            target = resolve_include(inc_name, file.parent)
            if not target:
                # The include file didn't resolve to a file
                write_line(f'#error Unable to find: {inc_name}')
                error_line(f'Error: Unable to find: {inc_name}')
            elif target in excludes:
                # The file was excluded so error if the compiler uses it
                write_line(f'#error Using excluded file: {inc_name} (re-amalgamate source to fix)')
                error_line(f'Excluding: {inc_name}')
            elif target in found:
                # Already handled once, so leave only a marker behind
                write_line(f'/**** skipping file: {inc_name} ****/')
            else:
                # First encounter: remember it before recursing so cycles stop here
                found.add(target)
                if target in keeps:
                    # Flagged to keep, so pass the directive through untouched
                    write_line(f'/**** *NOT* inlining {inc_name} ****/')
                    write_line(text)
                    error_line(f'Not inlining: {inc_name}')
                else:
                    # Neither excluded nor seen before so inline it
                    write_line(f'/**** start inlining {inc_name} ****/')
                    add_file(target, inc_name)
                    write_line(f'/**** ended inlining {inc_name} ****/')
202*3117ece4Schristos
# Command-line entry point: parses the arguments, fills in the module-level
# search/exclusion state, then recursively inlines the input file into the
# chosen destination.
#
# Invalid input or root paths raise (from Path.resolve(strict=True)) and halt
# the run. An output file opened by argparse is closed on every exit path,
# including those failures; stdout is never closed here.
#
def main() -> None:
    # Module-level settings read by write_line() and add_file()
    global destn, keep_pragma

    parser = argparse.ArgumentParser(description='Amalgamate Tool', epilog=f'example: {sys.argv[0]} -r ../my/path -r ../other/path -o out.c in.c')
    parser.add_argument('-r', '--root', action='append', type=Path, help='file root search path')
    parser.add_argument('-x', '--exclude', action='append', help='file to completely exclude from inlining')
    parser.add_argument('-k', '--keep', action='append', help='file to exclude from inlining but keep the include directive')
    parser.add_argument('-p', '--pragma', action='store_true', default=False, help='keep any "#pragma once" directives (removed by default)')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'), help='output file (otherwise stdout)')
    parser.add_argument('input', type=Path, help='input file')
    args = parser.parse_args()

    # Take ownership of the output straight away so it is closed even if the
    # path checks below fail
    if args.output:
        destn = args.output
    try:
        # Fail early on an invalid input (and store it so we don't recurse)
        source = args.input.resolve(strict=True)
        found.add(source)

        # Resolve all of the root paths upfront (we'll halt here on invalid roots)
        for path in args.root or []:
            roots.add(path.resolve(strict=True))

        # The remaining params: so resolve the excluded files and #pragma once directive
        resolve_excluded_files(args.exclude, excludes, source.parent)
        resolve_excluded_files(args.keep, keeps, source.parent)
        keep_pragma = args.pragma

        # Then recursively process the input file
        add_file(source)
    finally:
        # '-o -' hands back sys.stdout, which is not ours to close
        if destn is not sys.stdout:
            destn.close()


# Only run as a script, so the module can be imported (for example to call the
# regex self-tests) without parsing the command line.
if __name__ == '__main__':
    main()
235