xref: /llvm-project/clang/utils/token-delta.py (revision dd3c26a045c081620375a878159f536758baba6e)
1#!/usr/bin/env python
2
3from __future__ import absolute_import, division, print_function
4import os
5import re
6import subprocess
7import sys
8import tempfile
9
10###
11
12
13class DeltaAlgorithm(object):
14    def __init__(self):
15        self.cache = set()
16
17    def test(self, changes):
18        abstract
19
20    ###
21
22    def getTestResult(self, changes):
23        # There is no reason to cache successful tests because we will
24        # always reduce the changeset when we see one.
25
26        changeset = frozenset(changes)
27        if changeset in self.cache:
28            return False
29        elif not self.test(changes):
30            self.cache.add(changeset)
31            return False
32        else:
33            return True
34
35    def run(self, changes, force=False):
36        # Make sure the initial test passes, if not then (a) either
37        # the user doesn't expect monotonicity, and we may end up
38        # doing O(N^2) tests, or (b) the test is wrong. Avoid the
39        # O(N^2) case unless user requests it.
40        if not force:
41            if not self.getTestResult(changes):
42                raise ValueError("Initial test passed to delta fails.")
43
44        # Check empty set first to quickly find poor test functions.
45        if self.getTestResult(set()):
46            return set()
47        else:
48            return self.delta(changes, self.split(changes))
49
50    def split(self, S):
51        """split(set) -> [sets]
52
53        Partition a set into one or two pieces.
54        """
55
56        # There are many ways to split, we could do a better job with more
57        # context information (but then the API becomes grosser).
58        L = list(S)
59        mid = len(L) // 2
60        if mid == 0:
61            return (L,)
62        else:
63            return L[:mid], L[mid:]
64
65    def delta(self, c, sets):
66        # assert(reduce(set.union, sets, set()) == c)
67
68        # If there is nothing left we can remove, we are done.
69        if len(sets) <= 1:
70            return c
71
72        # Look for a passing subset.
73        res = self.search(c, sets)
74        if res is not None:
75            return res
76
77        # Otherwise, partition sets if possible; if not we are done.
78        refined = sum(map(list, map(self.split, sets)), [])
79        if len(refined) == len(sets):
80            return c
81
82        return self.delta(c, refined)
83
84    def search(self, c, sets):
85        for i, S in enumerate(sets):
86            # If test passes on this subset alone, recurse.
87            if self.getTestResult(S):
88                return self.delta(S, self.split(S))
89
90            # Otherwise if we have more than two sets, see if test
91            # pases without this subset.
92            if len(sets) > 2:
93                complement = sum(sets[:i] + sets[i + 1 :], [])
94                if self.getTestResult(complement):
95                    return self.delta(complement, sets[:i] + sets[i + 1 :])
96
97
98###
99
100
101class Token(object):
102    def __init__(self, type, data, flags, file, line, column):
103        self.type = type
104        self.data = data
105        self.flags = flags
106        self.file = file
107        self.line = line
108        self.column = column
109
110
111kTokenRE = re.compile(
112    r"""([a-z_]+) '(.*)'\t(.*)\tLoc=<(.*):(.*):(.*)>""", re.DOTALL | re.MULTILINE
113)
114
115
116def getTokens(path):
117    p = subprocess.Popen(
118        ["clang", "-dump-raw-tokens", path],
119        stdin=subprocess.PIPE,
120        stdout=subprocess.PIPE,
121        stderr=subprocess.PIPE,
122    )
123    out, err = p.communicate()
124
125    tokens = []
126    collect = None
127    for ln in err.split("\n"):
128        # Silly programmers refuse to print in simple machine readable
129        # formats. Whatever.
130        if collect is None:
131            collect = ln
132        else:
133            collect = collect + "\n" + ln
134        if "Loc=<" in ln and ln.endswith(">"):
135            ln, collect = collect, None
136            tokens.append(Token(*kTokenRE.match(ln).groups()))
137
138    return tokens
139
140
141###
142
143
144class TMBDDelta(DeltaAlgorithm):
145    def __init__(self, testProgram, tokenLists, log):
146        def patchName(name, suffix):
147            base, ext = os.path.splitext(name)
148            return base + "." + suffix + ext
149
150        super(TMBDDelta, self).__init__()
151        self.testProgram = testProgram
152        self.tokenLists = tokenLists
153        self.tempFiles = [patchName(f, "tmp") for f, _ in self.tokenLists]
154        self.targetFiles = [patchName(f, "ok") for f, _ in self.tokenLists]
155        self.log = log
156        self.numTests = 0
157
158    def writeFiles(self, changes, fileNames):
159        assert len(fileNames) == len(self.tokenLists)
160        byFile = [[] for i in self.tokenLists]
161        for i, j in changes:
162            byFile[i].append(j)
163
164        for i, (file, tokens) in enumerate(self.tokenLists):
165            f = open(fileNames[i], "w")
166            for j in byFile[i]:
167                f.write(tokens[j])
168            f.close()
169
170        return byFile
171
172    def test(self, changes):
173        self.numTests += 1
174
175        byFile = self.writeFiles(changes, self.tempFiles)
176
177        if self.log:
178            print("TEST - ", end=" ", file=sys.stderr)
179            if self.log > 1:
180                for i, (file, _) in enumerate(self.tokenLists):
181                    indices = byFile[i]
182                    if i:
183                        sys.stderr.write("\n      ")
184                    sys.stderr.write("%s:%d tokens: [" % (file, len(byFile[i])))
185                    prev = None
186                    for j in byFile[i]:
187                        if prev is None or j != prev + 1:
188                            if prev:
189                                sys.stderr.write("%d][" % prev)
190                            sys.stderr.write(str(j))
191                            sys.stderr.write(":")
192                        prev = j
193                    if byFile[i]:
194                        sys.stderr.write(str(byFile[i][-1]))
195                    sys.stderr.write("] ")
196            else:
197                print(
198                    ", ".join(
199                        [
200                            "%s:%d tokens" % (file, len(byFile[i]))
201                            for i, (file, _) in enumerate(self.tokenLists)
202                        ]
203                    ),
204                    end=" ",
205                    file=sys.stderr,
206                )
207
208        p = subprocess.Popen([self.testProgram] + self.tempFiles)
209        res = p.wait() == 0
210
211        if res:
212            self.writeFiles(changes, self.targetFiles)
213
214        if self.log:
215            print("=> %s" % res, file=sys.stderr)
216        else:
217            if res:
218                print("\nSUCCESS (%d tokens)" % len(changes))
219            else:
220                sys.stderr.write(".")
221
222        return res
223
224    def run(self):
225        res = super(TMBDDelta, self).run(
226            [
227                (i, j)
228                for i, (file, tokens) in enumerate(self.tokenLists)
229                for j in range(len(tokens))
230            ]
231        )
232        self.writeFiles(res, self.targetFiles)
233        if not self.log:
234            print(file=sys.stderr)
235        return res
236
237
238def tokenBasedMultiDelta(program, files, log):
239    # Read in the lists of tokens.
240    tokenLists = [(file, [t.data for t in getTokens(file)]) for file in files]
241
242    numTokens = sum([len(tokens) for _, tokens in tokenLists])
243    print("Delta on %s with %d tokens." % (", ".join(files), numTokens))
244
245    tbmd = TMBDDelta(program, tokenLists, log)
246
247    res = tbmd.run()
248
249    print(
250        "Finished %s with %d tokens (in %d tests)."
251        % (", ".join(tbmd.targetFiles), len(res), tbmd.numTests)
252    )
253
254
255def main():
256    from optparse import OptionParser, OptionGroup
257
258    parser = OptionParser("%prog <test program> {files+}")
259    parser.add_option(
260        "",
261        "--debug",
262        dest="debugLevel",
263        help="set debug level [default %default]",
264        action="store",
265        type=int,
266        default=0,
267    )
268    (opts, args) = parser.parse_args()
269
270    if len(args) <= 1:
271        parser.error("Invalid number of arguments.")
272
273    program, files = args[0], args[1:]
274
275    md = tokenBasedMultiDelta(program, files, log=opts.debugLevel)
276
277
278if __name__ == "__main__":
279    try:
280        main()
281    except KeyboardInterrupt:
282        print("Interrupted.", file=sys.stderr)
283        os._exit(1)  # Avoid freeing our giant cache.
284