1*18fd37a7SXin LI /* Analyze file differences for GNU DIFF.
2*18fd37a7SXin LI
3*18fd37a7SXin LI Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4*18fd37a7SXin LI 2004 Free Software Foundation, Inc.
5*18fd37a7SXin LI
6*18fd37a7SXin LI This file is part of GNU DIFF.
7*18fd37a7SXin LI
8*18fd37a7SXin LI GNU DIFF is free software; you can redistribute it and/or modify
9*18fd37a7SXin LI it under the terms of the GNU General Public License as published by
10*18fd37a7SXin LI the Free Software Foundation; either version 2, or (at your option)
11*18fd37a7SXin LI any later version.
12*18fd37a7SXin LI
13*18fd37a7SXin LI GNU DIFF is distributed in the hope that it will be useful,
14*18fd37a7SXin LI but WITHOUT ANY WARRANTY; without even the implied warranty of
15*18fd37a7SXin LI MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16*18fd37a7SXin LI GNU General Public License for more details.
17*18fd37a7SXin LI
18*18fd37a7SXin LI You should have received a copy of the GNU General Public License
19*18fd37a7SXin LI along with this program; see the file COPYING.
20*18fd37a7SXin LI If not, write to the Free Software Foundation,
21*18fd37a7SXin LI 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22*18fd37a7SXin LI
23*18fd37a7SXin LI /* The basic algorithm is described in:
24*18fd37a7SXin LI "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
25*18fd37a7SXin LI Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
26*18fd37a7SXin LI see especially section 4.2, which describes the variation used below.
27*18fd37a7SXin LI Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
28*18fd37a7SXin LI heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
29*18fd37a7SXin LI at the price of producing suboptimal output for large inputs with
30*18fd37a7SXin LI many differences.
31*18fd37a7SXin LI
32*18fd37a7SXin LI The basic algorithm was independently discovered as described in:
33*18fd37a7SXin LI "Algorithms for Approximate String Matching", E. Ukkonen,
34*18fd37a7SXin LI Information and Control Vol. 64, 1985, pp. 100-118. */
35*18fd37a7SXin LI
36*18fd37a7SXin LI #include "diff.h"
37*18fd37a7SXin LI #include <cmpbuf.h>
38*18fd37a7SXin LI #include <error.h>
39*18fd37a7SXin LI #include <file-type.h>
40*18fd37a7SXin LI #include <xalloc.h>
41*18fd37a7SXin LI
42*18fd37a7SXin LI static lin *xvec, *yvec; /* Vectors being compared. */
43*18fd37a7SXin LI static lin *fdiag; /* Vector, indexed by diagonal, containing
44*18fd37a7SXin LI 1 + the X coordinate of the point furthest
45*18fd37a7SXin LI along the given diagonal in the forward
46*18fd37a7SXin LI search of the edit matrix. */
47*18fd37a7SXin LI static lin *bdiag; /* Vector, indexed by diagonal, containing
48*18fd37a7SXin LI the X coordinate of the point furthest
49*18fd37a7SXin LI along the given diagonal in the backward
50*18fd37a7SXin LI search of the edit matrix. */
51*18fd37a7SXin LI static lin too_expensive; /* Edit scripts longer than this are too
52*18fd37a7SXin LI expensive to compute. */
53*18fd37a7SXin LI
54*18fd37a7SXin LI #define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */
55*18fd37a7SXin LI
56*18fd37a7SXin LI struct partition
57*18fd37a7SXin LI {
58*18fd37a7SXin LI lin xmid, ymid; /* Midpoints of this partition. */
59*18fd37a7SXin LI bool lo_minimal; /* Nonzero if low half will be analyzed minimally. */
60*18fd37a7SXin LI bool hi_minimal; /* Likewise for high half. */
61*18fd37a7SXin LI };
62*18fd37a7SXin LI
63*18fd37a7SXin LI /* Find the midpoint of the shortest edit script for a specified
64*18fd37a7SXin LI portion of the two files.
65*18fd37a7SXin LI
66*18fd37a7SXin LI Scan from the beginnings of the files, and simultaneously from the ends,
67*18fd37a7SXin LI doing a breadth-first search through the space of edit-sequence.
68*18fd37a7SXin LI When the two searches meet, we have found the midpoint of the shortest
69*18fd37a7SXin LI edit sequence.
70*18fd37a7SXin LI
71*18fd37a7SXin LI If FIND_MINIMAL is nonzero, find the minimal edit script regardless
72*18fd37a7SXin LI of expense. Otherwise, if the search is too expensive, use
73*18fd37a7SXin LI heuristics to stop the search and report a suboptimal answer.
74*18fd37a7SXin LI
75*18fd37a7SXin LI Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
76*18fd37a7SXin LI XMID - YMID equals the number of inserted lines minus the number
77*18fd37a7SXin LI of deleted lines (counting only lines before the midpoint).
78*18fd37a7SXin LI
79*18fd37a7SXin LI Set PART->lo_minimal to true iff the minimal edit script for the
80*18fd37a7SXin LI left half of the partition is known; similarly for PART->hi_minimal.
81*18fd37a7SXin LI
82*18fd37a7SXin LI This function assumes that the first lines of the specified portions
83*18fd37a7SXin LI of the two files do not match, and likewise that the last lines do not
84*18fd37a7SXin LI match. The caller must trim matching lines from the beginning and end
85*18fd37a7SXin LI of the portions it is going to specify.
86*18fd37a7SXin LI
87*18fd37a7SXin LI If we return the "wrong" partitions,
88*18fd37a7SXin LI the worst this can do is cause suboptimal diff output.
89*18fd37a7SXin LI It cannot cause incorrect diff output. */
90*18fd37a7SXin LI
91*18fd37a7SXin LI static void
diag(lin xoff,lin xlim,lin yoff,lin ylim,bool find_minimal,struct partition * part)92*18fd37a7SXin LI diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
93*18fd37a7SXin LI struct partition *part)
94*18fd37a7SXin LI {
95*18fd37a7SXin LI lin *const fd = fdiag; /* Give the compiler a chance. */
96*18fd37a7SXin LI lin *const bd = bdiag; /* Additional help for the compiler. */
97*18fd37a7SXin LI lin const *const xv = xvec; /* Still more help for the compiler. */
98*18fd37a7SXin LI lin const *const yv = yvec; /* And more and more . . . */
99*18fd37a7SXin LI lin const dmin = xoff - ylim; /* Minimum valid diagonal. */
100*18fd37a7SXin LI lin const dmax = xlim - yoff; /* Maximum valid diagonal. */
101*18fd37a7SXin LI lin const fmid = xoff - yoff; /* Center diagonal of top-down search. */
102*18fd37a7SXin LI lin const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
103*18fd37a7SXin LI lin fmin = fmid, fmax = fmid; /* Limits of top-down search. */
104*18fd37a7SXin LI lin bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */
105*18fd37a7SXin LI lin c; /* Cost. */
106*18fd37a7SXin LI bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
107*18fd37a7SXin LI diagonal with respect to the northwest. */
108*18fd37a7SXin LI
109*18fd37a7SXin LI fd[fmid] = xoff;
110*18fd37a7SXin LI bd[bmid] = xlim;
111*18fd37a7SXin LI
112*18fd37a7SXin LI for (c = 1;; ++c)
113*18fd37a7SXin LI {
114*18fd37a7SXin LI lin d; /* Active diagonal. */
115*18fd37a7SXin LI bool big_snake = false;
116*18fd37a7SXin LI
117*18fd37a7SXin LI /* Extend the top-down search by an edit step in each diagonal. */
118*18fd37a7SXin LI fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
119*18fd37a7SXin LI fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
120*18fd37a7SXin LI for (d = fmax; d >= fmin; d -= 2)
121*18fd37a7SXin LI {
122*18fd37a7SXin LI lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
123*18fd37a7SXin LI
124*18fd37a7SXin LI if (tlo >= thi)
125*18fd37a7SXin LI x = tlo + 1;
126*18fd37a7SXin LI else
127*18fd37a7SXin LI x = thi;
128*18fd37a7SXin LI oldx = x;
129*18fd37a7SXin LI y = x - d;
130*18fd37a7SXin LI while (x < xlim && y < ylim && xv[x] == yv[y])
131*18fd37a7SXin LI ++x, ++y;
132*18fd37a7SXin LI if (x - oldx > SNAKE_LIMIT)
133*18fd37a7SXin LI big_snake = true;
134*18fd37a7SXin LI fd[d] = x;
135*18fd37a7SXin LI if (odd && bmin <= d && d <= bmax && bd[d] <= x)
136*18fd37a7SXin LI {
137*18fd37a7SXin LI part->xmid = x;
138*18fd37a7SXin LI part->ymid = y;
139*18fd37a7SXin LI part->lo_minimal = part->hi_minimal = true;
140*18fd37a7SXin LI return;
141*18fd37a7SXin LI }
142*18fd37a7SXin LI }
143*18fd37a7SXin LI
144*18fd37a7SXin LI /* Similarly extend the bottom-up search. */
145*18fd37a7SXin LI bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
146*18fd37a7SXin LI bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
147*18fd37a7SXin LI for (d = bmax; d >= bmin; d -= 2)
148*18fd37a7SXin LI {
149*18fd37a7SXin LI lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
150*18fd37a7SXin LI
151*18fd37a7SXin LI if (tlo < thi)
152*18fd37a7SXin LI x = tlo;
153*18fd37a7SXin LI else
154*18fd37a7SXin LI x = thi - 1;
155*18fd37a7SXin LI oldx = x;
156*18fd37a7SXin LI y = x - d;
157*18fd37a7SXin LI while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
158*18fd37a7SXin LI --x, --y;
159*18fd37a7SXin LI if (oldx - x > SNAKE_LIMIT)
160*18fd37a7SXin LI big_snake = true;
161*18fd37a7SXin LI bd[d] = x;
162*18fd37a7SXin LI if (!odd && fmin <= d && d <= fmax && x <= fd[d])
163*18fd37a7SXin LI {
164*18fd37a7SXin LI part->xmid = x;
165*18fd37a7SXin LI part->ymid = y;
166*18fd37a7SXin LI part->lo_minimal = part->hi_minimal = true;
167*18fd37a7SXin LI return;
168*18fd37a7SXin LI }
169*18fd37a7SXin LI }
170*18fd37a7SXin LI
171*18fd37a7SXin LI if (find_minimal)
172*18fd37a7SXin LI continue;
173*18fd37a7SXin LI
174*18fd37a7SXin LI /* Heuristic: check occasionally for a diagonal that has made
175*18fd37a7SXin LI lots of progress compared with the edit distance.
176*18fd37a7SXin LI If we have any such, find the one that has made the most
177*18fd37a7SXin LI progress and return it as if it had succeeded.
178*18fd37a7SXin LI
179*18fd37a7SXin LI With this heuristic, for files with a constant small density
180*18fd37a7SXin LI of changes, the algorithm is linear in the file size. */
181*18fd37a7SXin LI
182*18fd37a7SXin LI if (200 < c && big_snake && speed_large_files)
183*18fd37a7SXin LI {
184*18fd37a7SXin LI lin best = 0;
185*18fd37a7SXin LI
186*18fd37a7SXin LI for (d = fmax; d >= fmin; d -= 2)
187*18fd37a7SXin LI {
188*18fd37a7SXin LI lin dd = d - fmid;
189*18fd37a7SXin LI lin x = fd[d];
190*18fd37a7SXin LI lin y = x - d;
191*18fd37a7SXin LI lin v = (x - xoff) * 2 - dd;
192*18fd37a7SXin LI if (v > 12 * (c + (dd < 0 ? -dd : dd)))
193*18fd37a7SXin LI {
194*18fd37a7SXin LI if (v > best
195*18fd37a7SXin LI && xoff + SNAKE_LIMIT <= x && x < xlim
196*18fd37a7SXin LI && yoff + SNAKE_LIMIT <= y && y < ylim)
197*18fd37a7SXin LI {
198*18fd37a7SXin LI /* We have a good enough best diagonal;
199*18fd37a7SXin LI now insist that it end with a significant snake. */
200*18fd37a7SXin LI int k;
201*18fd37a7SXin LI
202*18fd37a7SXin LI for (k = 1; xv[x - k] == yv[y - k]; k++)
203*18fd37a7SXin LI if (k == SNAKE_LIMIT)
204*18fd37a7SXin LI {
205*18fd37a7SXin LI best = v;
206*18fd37a7SXin LI part->xmid = x;
207*18fd37a7SXin LI part->ymid = y;
208*18fd37a7SXin LI break;
209*18fd37a7SXin LI }
210*18fd37a7SXin LI }
211*18fd37a7SXin LI }
212*18fd37a7SXin LI }
213*18fd37a7SXin LI if (best > 0)
214*18fd37a7SXin LI {
215*18fd37a7SXin LI part->lo_minimal = true;
216*18fd37a7SXin LI part->hi_minimal = false;
217*18fd37a7SXin LI return;
218*18fd37a7SXin LI }
219*18fd37a7SXin LI
220*18fd37a7SXin LI best = 0;
221*18fd37a7SXin LI for (d = bmax; d >= bmin; d -= 2)
222*18fd37a7SXin LI {
223*18fd37a7SXin LI lin dd = d - bmid;
224*18fd37a7SXin LI lin x = bd[d];
225*18fd37a7SXin LI lin y = x - d;
226*18fd37a7SXin LI lin v = (xlim - x) * 2 + dd;
227*18fd37a7SXin LI if (v > 12 * (c + (dd < 0 ? -dd : dd)))
228*18fd37a7SXin LI {
229*18fd37a7SXin LI if (v > best
230*18fd37a7SXin LI && xoff < x && x <= xlim - SNAKE_LIMIT
231*18fd37a7SXin LI && yoff < y && y <= ylim - SNAKE_LIMIT)
232*18fd37a7SXin LI {
233*18fd37a7SXin LI /* We have a good enough best diagonal;
234*18fd37a7SXin LI now insist that it end with a significant snake. */
235*18fd37a7SXin LI int k;
236*18fd37a7SXin LI
237*18fd37a7SXin LI for (k = 0; xv[x + k] == yv[y + k]; k++)
238*18fd37a7SXin LI if (k == SNAKE_LIMIT - 1)
239*18fd37a7SXin LI {
240*18fd37a7SXin LI best = v;
241*18fd37a7SXin LI part->xmid = x;
242*18fd37a7SXin LI part->ymid = y;
243*18fd37a7SXin LI break;
244*18fd37a7SXin LI }
245*18fd37a7SXin LI }
246*18fd37a7SXin LI }
247*18fd37a7SXin LI }
248*18fd37a7SXin LI if (best > 0)
249*18fd37a7SXin LI {
250*18fd37a7SXin LI part->lo_minimal = false;
251*18fd37a7SXin LI part->hi_minimal = true;
252*18fd37a7SXin LI return;
253*18fd37a7SXin LI }
254*18fd37a7SXin LI }
255*18fd37a7SXin LI
256*18fd37a7SXin LI /* Heuristic: if we've gone well beyond the call of duty,
257*18fd37a7SXin LI give up and report halfway between our best results so far. */
258*18fd37a7SXin LI if (c >= too_expensive)
259*18fd37a7SXin LI {
260*18fd37a7SXin LI lin fxybest, fxbest;
261*18fd37a7SXin LI lin bxybest, bxbest;
262*18fd37a7SXin LI
263*18fd37a7SXin LI fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */
264*18fd37a7SXin LI
265*18fd37a7SXin LI /* Find forward diagonal that maximizes X + Y. */
266*18fd37a7SXin LI fxybest = -1;
267*18fd37a7SXin LI for (d = fmax; d >= fmin; d -= 2)
268*18fd37a7SXin LI {
269*18fd37a7SXin LI lin x = MIN (fd[d], xlim);
270*18fd37a7SXin LI lin y = x - d;
271*18fd37a7SXin LI if (ylim < y)
272*18fd37a7SXin LI x = ylim + d, y = ylim;
273*18fd37a7SXin LI if (fxybest < x + y)
274*18fd37a7SXin LI {
275*18fd37a7SXin LI fxybest = x + y;
276*18fd37a7SXin LI fxbest = x;
277*18fd37a7SXin LI }
278*18fd37a7SXin LI }
279*18fd37a7SXin LI
280*18fd37a7SXin LI /* Find backward diagonal that minimizes X + Y. */
281*18fd37a7SXin LI bxybest = LIN_MAX;
282*18fd37a7SXin LI for (d = bmax; d >= bmin; d -= 2)
283*18fd37a7SXin LI {
284*18fd37a7SXin LI lin x = MAX (xoff, bd[d]);
285*18fd37a7SXin LI lin y = x - d;
286*18fd37a7SXin LI if (y < yoff)
287*18fd37a7SXin LI x = yoff + d, y = yoff;
288*18fd37a7SXin LI if (x + y < bxybest)
289*18fd37a7SXin LI {
290*18fd37a7SXin LI bxybest = x + y;
291*18fd37a7SXin LI bxbest = x;
292*18fd37a7SXin LI }
293*18fd37a7SXin LI }
294*18fd37a7SXin LI
295*18fd37a7SXin LI /* Use the better of the two diagonals. */
296*18fd37a7SXin LI if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
297*18fd37a7SXin LI {
298*18fd37a7SXin LI part->xmid = fxbest;
299*18fd37a7SXin LI part->ymid = fxybest - fxbest;
300*18fd37a7SXin LI part->lo_minimal = true;
301*18fd37a7SXin LI part->hi_minimal = false;
302*18fd37a7SXin LI }
303*18fd37a7SXin LI else
304*18fd37a7SXin LI {
305*18fd37a7SXin LI part->xmid = bxbest;
306*18fd37a7SXin LI part->ymid = bxybest - bxbest;
307*18fd37a7SXin LI part->lo_minimal = false;
308*18fd37a7SXin LI part->hi_minimal = true;
309*18fd37a7SXin LI }
310*18fd37a7SXin LI return;
311*18fd37a7SXin LI }
312*18fd37a7SXin LI }
313*18fd37a7SXin LI }
314*18fd37a7SXin LI
315*18fd37a7SXin LI /* Compare in detail contiguous subsequences of the two files
316*18fd37a7SXin LI which are known, as a whole, to match each other.
317*18fd37a7SXin LI
318*18fd37a7SXin LI The results are recorded in the vectors files[N].changed, by
319*18fd37a7SXin LI storing 1 in the element for each line that is an insertion or deletion.
320*18fd37a7SXin LI
321*18fd37a7SXin LI The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
322*18fd37a7SXin LI
323*18fd37a7SXin LI Note that XLIM, YLIM are exclusive bounds.
324*18fd37a7SXin LI All line numbers are origin-0 and discarded lines are not counted.
325*18fd37a7SXin LI
326*18fd37a7SXin LI If FIND_MINIMAL, find a minimal difference no matter how
327*18fd37a7SXin LI expensive it is. */
328*18fd37a7SXin LI
329*18fd37a7SXin LI static void
compareseq(lin xoff,lin xlim,lin yoff,lin ylim,bool find_minimal)330*18fd37a7SXin LI compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
331*18fd37a7SXin LI {
332*18fd37a7SXin LI lin const *xv = xvec; /* Help the compiler. */
333*18fd37a7SXin LI lin const *yv = yvec;
334*18fd37a7SXin LI
335*18fd37a7SXin LI /* Slide down the bottom initial diagonal. */
336*18fd37a7SXin LI while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
337*18fd37a7SXin LI ++xoff, ++yoff;
338*18fd37a7SXin LI /* Slide up the top initial diagonal. */
339*18fd37a7SXin LI while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
340*18fd37a7SXin LI --xlim, --ylim;
341*18fd37a7SXin LI
342*18fd37a7SXin LI /* Handle simple cases. */
343*18fd37a7SXin LI if (xoff == xlim)
344*18fd37a7SXin LI while (yoff < ylim)
345*18fd37a7SXin LI files[1].changed[files[1].realindexes[yoff++]] = 1;
346*18fd37a7SXin LI else if (yoff == ylim)
347*18fd37a7SXin LI while (xoff < xlim)
348*18fd37a7SXin LI files[0].changed[files[0].realindexes[xoff++]] = 1;
349*18fd37a7SXin LI else
350*18fd37a7SXin LI {
351*18fd37a7SXin LI struct partition part;
352*18fd37a7SXin LI
353*18fd37a7SXin LI /* Find a point of correspondence in the middle of the files. */
354*18fd37a7SXin LI diag (xoff, xlim, yoff, ylim, find_minimal, &part);
355*18fd37a7SXin LI
356*18fd37a7SXin LI /* Use the partitions to split this problem into subproblems. */
357*18fd37a7SXin LI compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
358*18fd37a7SXin LI compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
359*18fd37a7SXin LI }
360*18fd37a7SXin LI }
361*18fd37a7SXin LI
362*18fd37a7SXin LI /* Discard lines from one file that have no matches in the other file.
363*18fd37a7SXin LI
364*18fd37a7SXin LI A line which is discarded will not be considered by the actual
365*18fd37a7SXin LI comparison algorithm; it will be as if that line were not in the file.
366*18fd37a7SXin LI The file's `realindexes' table maps virtual line numbers
367*18fd37a7SXin LI (which don't count the discarded lines) into real line numbers;
368*18fd37a7SXin LI this is how the actual comparison algorithm produces results
369*18fd37a7SXin LI that are comprehensible when the discarded lines are counted.
370*18fd37a7SXin LI
371*18fd37a7SXin LI When we discard a line, we also mark it as a deletion or insertion
372*18fd37a7SXin LI so that it will be printed in the output. */
373*18fd37a7SXin LI
374*18fd37a7SXin LI static void
discard_confusing_lines(struct file_data filevec[])375*18fd37a7SXin LI discard_confusing_lines (struct file_data filevec[])
376*18fd37a7SXin LI {
377*18fd37a7SXin LI int f;
378*18fd37a7SXin LI lin i;
379*18fd37a7SXin LI char *discarded[2];
380*18fd37a7SXin LI lin *equiv_count[2];
381*18fd37a7SXin LI lin *p;
382*18fd37a7SXin LI
383*18fd37a7SXin LI /* Allocate our results. */
384*18fd37a7SXin LI p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
385*18fd37a7SXin LI * (2 * sizeof *p));
386*18fd37a7SXin LI for (f = 0; f < 2; f++)
387*18fd37a7SXin LI {
388*18fd37a7SXin LI filevec[f].undiscarded = p; p += filevec[f].buffered_lines;
389*18fd37a7SXin LI filevec[f].realindexes = p; p += filevec[f].buffered_lines;
390*18fd37a7SXin LI }
391*18fd37a7SXin LI
392*18fd37a7SXin LI /* Set up equiv_count[F][I] as the number of lines in file F
393*18fd37a7SXin LI that fall in equivalence class I. */
394*18fd37a7SXin LI
395*18fd37a7SXin LI p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
396*18fd37a7SXin LI equiv_count[0] = p;
397*18fd37a7SXin LI equiv_count[1] = p + filevec[0].equiv_max;
398*18fd37a7SXin LI
399*18fd37a7SXin LI for (i = 0; i < filevec[0].buffered_lines; ++i)
400*18fd37a7SXin LI ++equiv_count[0][filevec[0].equivs[i]];
401*18fd37a7SXin LI for (i = 0; i < filevec[1].buffered_lines; ++i)
402*18fd37a7SXin LI ++equiv_count[1][filevec[1].equivs[i]];
403*18fd37a7SXin LI
404*18fd37a7SXin LI /* Set up tables of which lines are going to be discarded. */
405*18fd37a7SXin LI
406*18fd37a7SXin LI discarded[0] = zalloc (filevec[0].buffered_lines
407*18fd37a7SXin LI + filevec[1].buffered_lines);
408*18fd37a7SXin LI discarded[1] = discarded[0] + filevec[0].buffered_lines;
409*18fd37a7SXin LI
410*18fd37a7SXin LI /* Mark to be discarded each line that matches no line of the other file.
411*18fd37a7SXin LI If a line matches many lines, mark it as provisionally discardable. */
412*18fd37a7SXin LI
413*18fd37a7SXin LI for (f = 0; f < 2; f++)
414*18fd37a7SXin LI {
415*18fd37a7SXin LI size_t end = filevec[f].buffered_lines;
416*18fd37a7SXin LI char *discards = discarded[f];
417*18fd37a7SXin LI lin *counts = equiv_count[1 - f];
418*18fd37a7SXin LI lin *equivs = filevec[f].equivs;
419*18fd37a7SXin LI size_t many = 5;
420*18fd37a7SXin LI size_t tem = end / 64;
421*18fd37a7SXin LI
422*18fd37a7SXin LI /* Multiply MANY by approximate square root of number of lines.
423*18fd37a7SXin LI That is the threshold for provisionally discardable lines. */
424*18fd37a7SXin LI while ((tem = tem >> 2) > 0)
425*18fd37a7SXin LI many *= 2;
426*18fd37a7SXin LI
427*18fd37a7SXin LI for (i = 0; i < end; i++)
428*18fd37a7SXin LI {
429*18fd37a7SXin LI lin nmatch;
430*18fd37a7SXin LI if (equivs[i] == 0)
431*18fd37a7SXin LI continue;
432*18fd37a7SXin LI nmatch = counts[equivs[i]];
433*18fd37a7SXin LI if (nmatch == 0)
434*18fd37a7SXin LI discards[i] = 1;
435*18fd37a7SXin LI else if (nmatch > many)
436*18fd37a7SXin LI discards[i] = 2;
437*18fd37a7SXin LI }
438*18fd37a7SXin LI }
439*18fd37a7SXin LI
440*18fd37a7SXin LI /* Don't really discard the provisional lines except when they occur
441*18fd37a7SXin LI in a run of discardables, with nonprovisionals at the beginning
442*18fd37a7SXin LI and end. */
443*18fd37a7SXin LI
444*18fd37a7SXin LI for (f = 0; f < 2; f++)
445*18fd37a7SXin LI {
446*18fd37a7SXin LI lin end = filevec[f].buffered_lines;
447*18fd37a7SXin LI register char *discards = discarded[f];
448*18fd37a7SXin LI
449*18fd37a7SXin LI for (i = 0; i < end; i++)
450*18fd37a7SXin LI {
451*18fd37a7SXin LI /* Cancel provisional discards not in middle of run of discards. */
452*18fd37a7SXin LI if (discards[i] == 2)
453*18fd37a7SXin LI discards[i] = 0;
454*18fd37a7SXin LI else if (discards[i] != 0)
455*18fd37a7SXin LI {
456*18fd37a7SXin LI /* We have found a nonprovisional discard. */
457*18fd37a7SXin LI register lin j;
458*18fd37a7SXin LI lin length;
459*18fd37a7SXin LI lin provisional = 0;
460*18fd37a7SXin LI
461*18fd37a7SXin LI /* Find end of this run of discardable lines.
462*18fd37a7SXin LI Count how many are provisionally discardable. */
463*18fd37a7SXin LI for (j = i; j < end; j++)
464*18fd37a7SXin LI {
465*18fd37a7SXin LI if (discards[j] == 0)
466*18fd37a7SXin LI break;
467*18fd37a7SXin LI if (discards[j] == 2)
468*18fd37a7SXin LI ++provisional;
469*18fd37a7SXin LI }
470*18fd37a7SXin LI
471*18fd37a7SXin LI /* Cancel provisional discards at end, and shrink the run. */
472*18fd37a7SXin LI while (j > i && discards[j - 1] == 2)
473*18fd37a7SXin LI discards[--j] = 0, --provisional;
474*18fd37a7SXin LI
475*18fd37a7SXin LI /* Now we have the length of a run of discardable lines
476*18fd37a7SXin LI whose first and last are not provisional. */
477*18fd37a7SXin LI length = j - i;
478*18fd37a7SXin LI
479*18fd37a7SXin LI /* If 1/4 of the lines in the run are provisional,
480*18fd37a7SXin LI cancel discarding of all provisional lines in the run. */
481*18fd37a7SXin LI if (provisional * 4 > length)
482*18fd37a7SXin LI {
483*18fd37a7SXin LI while (j > i)
484*18fd37a7SXin LI if (discards[--j] == 2)
485*18fd37a7SXin LI discards[j] = 0;
486*18fd37a7SXin LI }
487*18fd37a7SXin LI else
488*18fd37a7SXin LI {
489*18fd37a7SXin LI register lin consec;
490*18fd37a7SXin LI lin minimum = 1;
491*18fd37a7SXin LI lin tem = length >> 2;
492*18fd37a7SXin LI
493*18fd37a7SXin LI /* MINIMUM is approximate square root of LENGTH/4.
494*18fd37a7SXin LI A subrun of two or more provisionals can stand
495*18fd37a7SXin LI when LENGTH is at least 16.
496*18fd37a7SXin LI A subrun of 4 or more can stand when LENGTH >= 64. */
497*18fd37a7SXin LI while (0 < (tem >>= 2))
498*18fd37a7SXin LI minimum <<= 1;
499*18fd37a7SXin LI minimum++;
500*18fd37a7SXin LI
501*18fd37a7SXin LI /* Cancel any subrun of MINIMUM or more provisionals
502*18fd37a7SXin LI within the larger run. */
503*18fd37a7SXin LI for (j = 0, consec = 0; j < length; j++)
504*18fd37a7SXin LI if (discards[i + j] != 2)
505*18fd37a7SXin LI consec = 0;
506*18fd37a7SXin LI else if (minimum == ++consec)
507*18fd37a7SXin LI /* Back up to start of subrun, to cancel it all. */
508*18fd37a7SXin LI j -= consec;
509*18fd37a7SXin LI else if (minimum < consec)
510*18fd37a7SXin LI discards[i + j] = 0;
511*18fd37a7SXin LI
512*18fd37a7SXin LI /* Scan from beginning of run
513*18fd37a7SXin LI until we find 3 or more nonprovisionals in a row
514*18fd37a7SXin LI or until the first nonprovisional at least 8 lines in.
515*18fd37a7SXin LI Until that point, cancel any provisionals. */
516*18fd37a7SXin LI for (j = 0, consec = 0; j < length; j++)
517*18fd37a7SXin LI {
518*18fd37a7SXin LI if (j >= 8 && discards[i + j] == 1)
519*18fd37a7SXin LI break;
520*18fd37a7SXin LI if (discards[i + j] == 2)
521*18fd37a7SXin LI consec = 0, discards[i + j] = 0;
522*18fd37a7SXin LI else if (discards[i + j] == 0)
523*18fd37a7SXin LI consec = 0;
524*18fd37a7SXin LI else
525*18fd37a7SXin LI consec++;
526*18fd37a7SXin LI if (consec == 3)
527*18fd37a7SXin LI break;
528*18fd37a7SXin LI }
529*18fd37a7SXin LI
530*18fd37a7SXin LI /* I advances to the last line of the run. */
531*18fd37a7SXin LI i += length - 1;
532*18fd37a7SXin LI
533*18fd37a7SXin LI /* Same thing, from end. */
534*18fd37a7SXin LI for (j = 0, consec = 0; j < length; j++)
535*18fd37a7SXin LI {
536*18fd37a7SXin LI if (j >= 8 && discards[i - j] == 1)
537*18fd37a7SXin LI break;
538*18fd37a7SXin LI if (discards[i - j] == 2)
539*18fd37a7SXin LI consec = 0, discards[i - j] = 0;
540*18fd37a7SXin LI else if (discards[i - j] == 0)
541*18fd37a7SXin LI consec = 0;
542*18fd37a7SXin LI else
543*18fd37a7SXin LI consec++;
544*18fd37a7SXin LI if (consec == 3)
545*18fd37a7SXin LI break;
546*18fd37a7SXin LI }
547*18fd37a7SXin LI }
548*18fd37a7SXin LI }
549*18fd37a7SXin LI }
550*18fd37a7SXin LI }
551*18fd37a7SXin LI
552*18fd37a7SXin LI /* Actually discard the lines. */
553*18fd37a7SXin LI for (f = 0; f < 2; f++)
554*18fd37a7SXin LI {
555*18fd37a7SXin LI char *discards = discarded[f];
556*18fd37a7SXin LI lin end = filevec[f].buffered_lines;
557*18fd37a7SXin LI lin j = 0;
558*18fd37a7SXin LI for (i = 0; i < end; ++i)
559*18fd37a7SXin LI if (minimal || discards[i] == 0)
560*18fd37a7SXin LI {
561*18fd37a7SXin LI filevec[f].undiscarded[j] = filevec[f].equivs[i];
562*18fd37a7SXin LI filevec[f].realindexes[j++] = i;
563*18fd37a7SXin LI }
564*18fd37a7SXin LI else
565*18fd37a7SXin LI filevec[f].changed[i] = 1;
566*18fd37a7SXin LI filevec[f].nondiscarded_lines = j;
567*18fd37a7SXin LI }
568*18fd37a7SXin LI
569*18fd37a7SXin LI free (discarded[0]);
570*18fd37a7SXin LI free (equiv_count[0]);
571*18fd37a7SXin LI }
572*18fd37a7SXin LI
573*18fd37a7SXin LI /* Adjust inserts/deletes of identical lines to join changes
574*18fd37a7SXin LI as much as possible.
575*18fd37a7SXin LI
576*18fd37a7SXin LI We do something when a run of changed lines include a
577*18fd37a7SXin LI line at one end and have an excluded, identical line at the other.
578*18fd37a7SXin LI We are free to choose which identical line is included.
579*18fd37a7SXin LI `compareseq' usually chooses the one at the beginning,
580*18fd37a7SXin LI but usually it is cleaner to consider the following identical line
581*18fd37a7SXin LI to be the "change". */
582*18fd37a7SXin LI
583*18fd37a7SXin LI static void
shift_boundaries(struct file_data filevec[])584*18fd37a7SXin LI shift_boundaries (struct file_data filevec[])
585*18fd37a7SXin LI {
586*18fd37a7SXin LI int f;
587*18fd37a7SXin LI
588*18fd37a7SXin LI for (f = 0; f < 2; f++)
589*18fd37a7SXin LI {
590*18fd37a7SXin LI char *changed = filevec[f].changed;
591*18fd37a7SXin LI char *other_changed = filevec[1 - f].changed;
592*18fd37a7SXin LI lin const *equivs = filevec[f].equivs;
593*18fd37a7SXin LI lin i = 0;
594*18fd37a7SXin LI lin j = 0;
595*18fd37a7SXin LI lin i_end = filevec[f].buffered_lines;
596*18fd37a7SXin LI
597*18fd37a7SXin LI while (1)
598*18fd37a7SXin LI {
599*18fd37a7SXin LI lin runlength, start, corresponding;
600*18fd37a7SXin LI
601*18fd37a7SXin LI /* Scan forwards to find beginning of another run of changes.
602*18fd37a7SXin LI Also keep track of the corresponding point in the other file. */
603*18fd37a7SXin LI
604*18fd37a7SXin LI while (i < i_end && !changed[i])
605*18fd37a7SXin LI {
606*18fd37a7SXin LI while (other_changed[j++])
607*18fd37a7SXin LI continue;
608*18fd37a7SXin LI i++;
609*18fd37a7SXin LI }
610*18fd37a7SXin LI
611*18fd37a7SXin LI if (i == i_end)
612*18fd37a7SXin LI break;
613*18fd37a7SXin LI
614*18fd37a7SXin LI start = i;
615*18fd37a7SXin LI
616*18fd37a7SXin LI /* Find the end of this run of changes. */
617*18fd37a7SXin LI
618*18fd37a7SXin LI while (changed[++i])
619*18fd37a7SXin LI continue;
620*18fd37a7SXin LI while (other_changed[j])
621*18fd37a7SXin LI j++;
622*18fd37a7SXin LI
623*18fd37a7SXin LI do
624*18fd37a7SXin LI {
625*18fd37a7SXin LI /* Record the length of this run of changes, so that
626*18fd37a7SXin LI we can later determine whether the run has grown. */
627*18fd37a7SXin LI runlength = i - start;
628*18fd37a7SXin LI
629*18fd37a7SXin LI /* Move the changed region back, so long as the
630*18fd37a7SXin LI previous unchanged line matches the last changed one.
631*18fd37a7SXin LI This merges with previous changed regions. */
632*18fd37a7SXin LI
633*18fd37a7SXin LI while (start && equivs[start - 1] == equivs[i - 1])
634*18fd37a7SXin LI {
635*18fd37a7SXin LI changed[--start] = 1;
636*18fd37a7SXin LI changed[--i] = 0;
637*18fd37a7SXin LI while (changed[start - 1])
638*18fd37a7SXin LI start--;
639*18fd37a7SXin LI while (other_changed[--j])
640*18fd37a7SXin LI continue;
641*18fd37a7SXin LI }
642*18fd37a7SXin LI
643*18fd37a7SXin LI /* Set CORRESPONDING to the end of the changed run, at the last
644*18fd37a7SXin LI point where it corresponds to a changed run in the other file.
645*18fd37a7SXin LI CORRESPONDING == I_END means no such point has been found. */
646*18fd37a7SXin LI corresponding = other_changed[j - 1] ? i : i_end;
647*18fd37a7SXin LI
648*18fd37a7SXin LI /* Move the changed region forward, so long as the
649*18fd37a7SXin LI first changed line matches the following unchanged one.
650*18fd37a7SXin LI This merges with following changed regions.
651*18fd37a7SXin LI Do this second, so that if there are no merges,
652*18fd37a7SXin LI the changed region is moved forward as far as possible. */
653*18fd37a7SXin LI
654*18fd37a7SXin LI while (i != i_end && equivs[start] == equivs[i])
655*18fd37a7SXin LI {
656*18fd37a7SXin LI changed[start++] = 0;
657*18fd37a7SXin LI changed[i++] = 1;
658*18fd37a7SXin LI while (changed[i])
659*18fd37a7SXin LI i++;
660*18fd37a7SXin LI while (other_changed[++j])
661*18fd37a7SXin LI corresponding = i;
662*18fd37a7SXin LI }
663*18fd37a7SXin LI }
664*18fd37a7SXin LI while (runlength != i - start);
665*18fd37a7SXin LI
666*18fd37a7SXin LI /* If possible, move the fully-merged run of changes
667*18fd37a7SXin LI back to a corresponding run in the other file. */
668*18fd37a7SXin LI
669*18fd37a7SXin LI while (corresponding < i)
670*18fd37a7SXin LI {
671*18fd37a7SXin LI changed[--start] = 1;
672*18fd37a7SXin LI changed[--i] = 0;
673*18fd37a7SXin LI while (other_changed[--j])
674*18fd37a7SXin LI continue;
675*18fd37a7SXin LI }
676*18fd37a7SXin LI }
677*18fd37a7SXin LI }
678*18fd37a7SXin LI }
679*18fd37a7SXin LI
680*18fd37a7SXin LI /* Cons an additional entry onto the front of an edit script OLD.
681*18fd37a7SXin LI LINE0 and LINE1 are the first affected lines in the two files (origin 0).
682*18fd37a7SXin LI DELETED is the number of lines deleted here from file 0.
683*18fd37a7SXin LI INSERTED is the number of lines inserted here in file 1.
684*18fd37a7SXin LI
685*18fd37a7SXin LI If DELETED is 0 then LINE0 is the number of the line before
686*18fd37a7SXin LI which the insertion was done; vice versa for INSERTED and LINE1. */
687*18fd37a7SXin LI
688*18fd37a7SXin LI static struct change *
add_change(lin line0,lin line1,lin deleted,lin inserted,struct change * old)689*18fd37a7SXin LI add_change (lin line0, lin line1, lin deleted, lin inserted,
690*18fd37a7SXin LI struct change *old)
691*18fd37a7SXin LI {
692*18fd37a7SXin LI struct change *new = xmalloc (sizeof *new);
693*18fd37a7SXin LI
694*18fd37a7SXin LI new->line0 = line0;
695*18fd37a7SXin LI new->line1 = line1;
696*18fd37a7SXin LI new->inserted = inserted;
697*18fd37a7SXin LI new->deleted = deleted;
698*18fd37a7SXin LI new->link = old;
699*18fd37a7SXin LI return new;
700*18fd37a7SXin LI }
701*18fd37a7SXin LI
702*18fd37a7SXin LI /* Scan the tables of which lines are inserted and deleted,
703*18fd37a7SXin LI producing an edit script in reverse order. */
704*18fd37a7SXin LI
705*18fd37a7SXin LI static struct change *
build_reverse_script(struct file_data const filevec[])706*18fd37a7SXin LI build_reverse_script (struct file_data const filevec[])
707*18fd37a7SXin LI {
708*18fd37a7SXin LI struct change *script = 0;
709*18fd37a7SXin LI char *changed0 = filevec[0].changed;
710*18fd37a7SXin LI char *changed1 = filevec[1].changed;
711*18fd37a7SXin LI lin len0 = filevec[0].buffered_lines;
712*18fd37a7SXin LI lin len1 = filevec[1].buffered_lines;
713*18fd37a7SXin LI
714*18fd37a7SXin LI /* Note that changedN[len0] does exist, and is 0. */
715*18fd37a7SXin LI
716*18fd37a7SXin LI lin i0 = 0, i1 = 0;
717*18fd37a7SXin LI
718*18fd37a7SXin LI while (i0 < len0 || i1 < len1)
719*18fd37a7SXin LI {
720*18fd37a7SXin LI if (changed0[i0] | changed1[i1])
721*18fd37a7SXin LI {
722*18fd37a7SXin LI lin line0 = i0, line1 = i1;
723*18fd37a7SXin LI
724*18fd37a7SXin LI /* Find # lines changed here in each file. */
725*18fd37a7SXin LI while (changed0[i0]) ++i0;
726*18fd37a7SXin LI while (changed1[i1]) ++i1;
727*18fd37a7SXin LI
728*18fd37a7SXin LI /* Record this change. */
729*18fd37a7SXin LI script = add_change (line0, line1, i0 - line0, i1 - line1, script);
730*18fd37a7SXin LI }
731*18fd37a7SXin LI
732*18fd37a7SXin LI /* We have reached lines in the two files that match each other. */
733*18fd37a7SXin LI i0++, i1++;
734*18fd37a7SXin LI }
735*18fd37a7SXin LI
736*18fd37a7SXin LI return script;
737*18fd37a7SXin LI }
738*18fd37a7SXin LI
739*18fd37a7SXin LI /* Scan the tables of which lines are inserted and deleted,
740*18fd37a7SXin LI producing an edit script in forward order. */
741*18fd37a7SXin LI
742*18fd37a7SXin LI static struct change *
build_script(struct file_data const filevec[])743*18fd37a7SXin LI build_script (struct file_data const filevec[])
744*18fd37a7SXin LI {
745*18fd37a7SXin LI struct change *script = 0;
746*18fd37a7SXin LI char *changed0 = filevec[0].changed;
747*18fd37a7SXin LI char *changed1 = filevec[1].changed;
748*18fd37a7SXin LI lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
749*18fd37a7SXin LI
750*18fd37a7SXin LI /* Note that changedN[-1] does exist, and is 0. */
751*18fd37a7SXin LI
752*18fd37a7SXin LI while (i0 >= 0 || i1 >= 0)
753*18fd37a7SXin LI {
754*18fd37a7SXin LI if (changed0[i0 - 1] | changed1[i1 - 1])
755*18fd37a7SXin LI {
756*18fd37a7SXin LI lin line0 = i0, line1 = i1;
757*18fd37a7SXin LI
758*18fd37a7SXin LI /* Find # lines changed here in each file. */
759*18fd37a7SXin LI while (changed0[i0 - 1]) --i0;
760*18fd37a7SXin LI while (changed1[i1 - 1]) --i1;
761*18fd37a7SXin LI
762*18fd37a7SXin LI /* Record this change. */
763*18fd37a7SXin LI script = add_change (i0, i1, line0 - i0, line1 - i1, script);
764*18fd37a7SXin LI }
765*18fd37a7SXin LI
766*18fd37a7SXin LI /* We have reached lines in the two files that match each other. */
767*18fd37a7SXin LI i0--, i1--;
768*18fd37a7SXin LI }
769*18fd37a7SXin LI
770*18fd37a7SXin LI return script;
771*18fd37a7SXin LI }
772*18fd37a7SXin LI
773*18fd37a7SXin LI /* If CHANGES, briefly report that two files differed.
774*18fd37a7SXin LI Return 2 if trouble, CHANGES otherwise. */
775*18fd37a7SXin LI static int
briefly_report(int changes,struct file_data const filevec[])776*18fd37a7SXin LI briefly_report (int changes, struct file_data const filevec[])
777*18fd37a7SXin LI {
778*18fd37a7SXin LI if (changes)
779*18fd37a7SXin LI {
780*18fd37a7SXin LI char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
781*18fd37a7SXin LI char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
782*18fd37a7SXin LI message ("Files %s and %s differ\n", label0, label1);
783*18fd37a7SXin LI if (! brief)
784*18fd37a7SXin LI changes = 2;
785*18fd37a7SXin LI }
786*18fd37a7SXin LI
787*18fd37a7SXin LI return changes;
788*18fd37a7SXin LI }
789*18fd37a7SXin LI
790*18fd37a7SXin LI /* Report the differences of two files. */
791*18fd37a7SXin LI int
diff_2_files(struct comparison * cmp)792*18fd37a7SXin LI diff_2_files (struct comparison *cmp)
793*18fd37a7SXin LI {
794*18fd37a7SXin LI lin diags;
795*18fd37a7SXin LI int f;
796*18fd37a7SXin LI struct change *e, *p;
797*18fd37a7SXin LI struct change *script;
798*18fd37a7SXin LI int changes;
799*18fd37a7SXin LI
800*18fd37a7SXin LI
801*18fd37a7SXin LI /* If we have detected that either file is binary,
802*18fd37a7SXin LI compare the two files as binary. This can happen
803*18fd37a7SXin LI only when the first chunk is read.
804*18fd37a7SXin LI Also, --brief without any --ignore-* options means
805*18fd37a7SXin LI we can speed things up by treating the files as binary. */
806*18fd37a7SXin LI
807*18fd37a7SXin LI if (read_files (cmp->file, files_can_be_treated_as_binary))
808*18fd37a7SXin LI {
809*18fd37a7SXin LI /* Files with different lengths must be different. */
810*18fd37a7SXin LI if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
811*18fd37a7SXin LI && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
812*18fd37a7SXin LI && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
813*18fd37a7SXin LI changes = 1;
814*18fd37a7SXin LI
815*18fd37a7SXin LI /* Standard input equals itself. */
816*18fd37a7SXin LI else if (cmp->file[0].desc == cmp->file[1].desc)
817*18fd37a7SXin LI changes = 0;
818*18fd37a7SXin LI
819*18fd37a7SXin LI else
820*18fd37a7SXin LI /* Scan both files, a buffer at a time, looking for a difference. */
821*18fd37a7SXin LI {
822*18fd37a7SXin LI /* Allocate same-sized buffers for both files. */
823*18fd37a7SXin LI size_t lcm_max = PTRDIFF_MAX - 1;
824*18fd37a7SXin LI size_t buffer_size =
825*18fd37a7SXin LI buffer_lcm (sizeof (word),
826*18fd37a7SXin LI buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
827*18fd37a7SXin LI STAT_BLOCKSIZE (cmp->file[1].stat),
828*18fd37a7SXin LI lcm_max),
829*18fd37a7SXin LI lcm_max);
830*18fd37a7SXin LI for (f = 0; f < 2; f++)
831*18fd37a7SXin LI cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
832*18fd37a7SXin LI
833*18fd37a7SXin LI for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
834*18fd37a7SXin LI {
835*18fd37a7SXin LI /* Read a buffer's worth from both files. */
836*18fd37a7SXin LI for (f = 0; f < 2; f++)
837*18fd37a7SXin LI if (0 <= cmp->file[f].desc)
838*18fd37a7SXin LI file_block_read (&cmp->file[f],
839*18fd37a7SXin LI buffer_size - cmp->file[f].buffered);
840*18fd37a7SXin LI
841*18fd37a7SXin LI /* If the buffers differ, the files differ. */
842*18fd37a7SXin LI if (cmp->file[0].buffered != cmp->file[1].buffered
843*18fd37a7SXin LI || memcmp (cmp->file[0].buffer,
844*18fd37a7SXin LI cmp->file[1].buffer,
845*18fd37a7SXin LI cmp->file[0].buffered))
846*18fd37a7SXin LI {
847*18fd37a7SXin LI changes = 1;
848*18fd37a7SXin LI break;
849*18fd37a7SXin LI }
850*18fd37a7SXin LI
851*18fd37a7SXin LI /* If we reach end of file, the files are the same. */
852*18fd37a7SXin LI if (cmp->file[0].buffered != buffer_size)
853*18fd37a7SXin LI {
854*18fd37a7SXin LI changes = 0;
855*18fd37a7SXin LI break;
856*18fd37a7SXin LI }
857*18fd37a7SXin LI }
858*18fd37a7SXin LI }
859*18fd37a7SXin LI
860*18fd37a7SXin LI changes = briefly_report (changes, cmp->file);
861*18fd37a7SXin LI }
862*18fd37a7SXin LI else
863*18fd37a7SXin LI {
864*18fd37a7SXin LI /* Allocate vectors for the results of comparison:
865*18fd37a7SXin LI a flag for each line of each file, saying whether that line
866*18fd37a7SXin LI is an insertion or deletion.
867*18fd37a7SXin LI Allocate an extra element, always 0, at each end of each vector. */
868*18fd37a7SXin LI
869*18fd37a7SXin LI size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
870*18fd37a7SXin LI char *flag_space = zalloc (s);
871*18fd37a7SXin LI cmp->file[0].changed = flag_space + 1;
872*18fd37a7SXin LI cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
873*18fd37a7SXin LI
874*18fd37a7SXin LI /* Some lines are obviously insertions or deletions
875*18fd37a7SXin LI because they don't match anything. Detect them now, and
876*18fd37a7SXin LI avoid even thinking about them in the main comparison algorithm. */
877*18fd37a7SXin LI
878*18fd37a7SXin LI discard_confusing_lines (cmp->file);
879*18fd37a7SXin LI
880*18fd37a7SXin LI /* Now do the main comparison algorithm, considering just the
881*18fd37a7SXin LI undiscarded lines. */
882*18fd37a7SXin LI
883*18fd37a7SXin LI xvec = cmp->file[0].undiscarded;
884*18fd37a7SXin LI yvec = cmp->file[1].undiscarded;
885*18fd37a7SXin LI diags = (cmp->file[0].nondiscarded_lines
886*18fd37a7SXin LI + cmp->file[1].nondiscarded_lines + 3);
887*18fd37a7SXin LI fdiag = xmalloc (diags * (2 * sizeof *fdiag));
888*18fd37a7SXin LI bdiag = fdiag + diags;
889*18fd37a7SXin LI fdiag += cmp->file[1].nondiscarded_lines + 1;
890*18fd37a7SXin LI bdiag += cmp->file[1].nondiscarded_lines + 1;
891*18fd37a7SXin LI
892*18fd37a7SXin LI /* Set TOO_EXPENSIVE to be approximate square root of input size,
893*18fd37a7SXin LI bounded below by 256. */
894*18fd37a7SXin LI too_expensive = 1;
895*18fd37a7SXin LI for (; diags != 0; diags >>= 2)
896*18fd37a7SXin LI too_expensive <<= 1;
897*18fd37a7SXin LI too_expensive = MAX (256, too_expensive);
898*18fd37a7SXin LI
899*18fd37a7SXin LI files[0] = cmp->file[0];
900*18fd37a7SXin LI files[1] = cmp->file[1];
901*18fd37a7SXin LI
902*18fd37a7SXin LI compareseq (0, cmp->file[0].nondiscarded_lines,
903*18fd37a7SXin LI 0, cmp->file[1].nondiscarded_lines, minimal);
904*18fd37a7SXin LI
905*18fd37a7SXin LI free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
906*18fd37a7SXin LI
907*18fd37a7SXin LI /* Modify the results slightly to make them prettier
908*18fd37a7SXin LI in cases where that can validly be done. */
909*18fd37a7SXin LI
910*18fd37a7SXin LI shift_boundaries (cmp->file);
911*18fd37a7SXin LI
912*18fd37a7SXin LI /* Get the results of comparison in the form of a chain
913*18fd37a7SXin LI of `struct change's -- an edit script. */
914*18fd37a7SXin LI
915*18fd37a7SXin LI if (output_style == OUTPUT_ED)
916*18fd37a7SXin LI script = build_reverse_script (cmp->file);
917*18fd37a7SXin LI else
918*18fd37a7SXin LI script = build_script (cmp->file);
919*18fd37a7SXin LI
920*18fd37a7SXin LI /* Set CHANGES if we had any diffs.
921*18fd37a7SXin LI If some changes are ignored, we must scan the script to decide. */
922*18fd37a7SXin LI if (ignore_blank_lines || ignore_regexp.fastmap)
923*18fd37a7SXin LI {
924*18fd37a7SXin LI struct change *next = script;
925*18fd37a7SXin LI changes = 0;
926*18fd37a7SXin LI
927*18fd37a7SXin LI while (next && changes == 0)
928*18fd37a7SXin LI {
929*18fd37a7SXin LI struct change *this, *end;
930*18fd37a7SXin LI lin first0, last0, first1, last1;
931*18fd37a7SXin LI
932*18fd37a7SXin LI /* Find a set of changes that belong together. */
933*18fd37a7SXin LI this = next;
934*18fd37a7SXin LI end = find_change (next);
935*18fd37a7SXin LI
936*18fd37a7SXin LI /* Disconnect them from the rest of the changes, making them
937*18fd37a7SXin LI a hunk, and remember the rest for next iteration. */
938*18fd37a7SXin LI next = end->link;
939*18fd37a7SXin LI end->link = 0;
940*18fd37a7SXin LI
941*18fd37a7SXin LI /* Determine whether this hunk is really a difference. */
942*18fd37a7SXin LI if (analyze_hunk (this, &first0, &last0, &first1, &last1))
943*18fd37a7SXin LI changes = 1;
944*18fd37a7SXin LI
945*18fd37a7SXin LI /* Reconnect the script so it will all be freed properly. */
946*18fd37a7SXin LI end->link = next;
947*18fd37a7SXin LI }
948*18fd37a7SXin LI }
949*18fd37a7SXin LI else
950*18fd37a7SXin LI changes = (script != 0);
951*18fd37a7SXin LI
952*18fd37a7SXin LI if (brief)
953*18fd37a7SXin LI changes = briefly_report (changes, cmp->file);
954*18fd37a7SXin LI else
955*18fd37a7SXin LI {
956*18fd37a7SXin LI if (changes | !no_diff_means_no_output)
957*18fd37a7SXin LI {
958*18fd37a7SXin LI /* Record info for starting up output,
959*18fd37a7SXin LI to be used if and when we have some output to print. */
960*18fd37a7SXin LI setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
961*18fd37a7SXin LI file_label[1] ? file_label[1] : cmp->file[1].name,
962*18fd37a7SXin LI cmp->parent != 0);
963*18fd37a7SXin LI
964*18fd37a7SXin LI switch (output_style)
965*18fd37a7SXin LI {
966*18fd37a7SXin LI case OUTPUT_CONTEXT:
967*18fd37a7SXin LI print_context_script (script, false);
968*18fd37a7SXin LI break;
969*18fd37a7SXin LI
970*18fd37a7SXin LI case OUTPUT_UNIFIED:
971*18fd37a7SXin LI print_context_script (script, true);
972*18fd37a7SXin LI break;
973*18fd37a7SXin LI
974*18fd37a7SXin LI case OUTPUT_ED:
975*18fd37a7SXin LI print_ed_script (script);
976*18fd37a7SXin LI break;
977*18fd37a7SXin LI
978*18fd37a7SXin LI case OUTPUT_FORWARD_ED:
979*18fd37a7SXin LI pr_forward_ed_script (script);
980*18fd37a7SXin LI break;
981*18fd37a7SXin LI
982*18fd37a7SXin LI case OUTPUT_RCS:
983*18fd37a7SXin LI print_rcs_script (script);
984*18fd37a7SXin LI break;
985*18fd37a7SXin LI
986*18fd37a7SXin LI case OUTPUT_NORMAL:
987*18fd37a7SXin LI print_normal_script (script);
988*18fd37a7SXin LI break;
989*18fd37a7SXin LI
990*18fd37a7SXin LI case OUTPUT_IFDEF:
991*18fd37a7SXin LI print_ifdef_script (script);
992*18fd37a7SXin LI break;
993*18fd37a7SXin LI
994*18fd37a7SXin LI case OUTPUT_SDIFF:
995*18fd37a7SXin LI print_sdiff_script (script);
996*18fd37a7SXin LI break;
997*18fd37a7SXin LI
998*18fd37a7SXin LI default:
999*18fd37a7SXin LI abort ();
1000*18fd37a7SXin LI }
1001*18fd37a7SXin LI
1002*18fd37a7SXin LI finish_output ();
1003*18fd37a7SXin LI }
1004*18fd37a7SXin LI }
1005*18fd37a7SXin LI
1006*18fd37a7SXin LI free (cmp->file[0].undiscarded);
1007*18fd37a7SXin LI
1008*18fd37a7SXin LI free (flag_space);
1009*18fd37a7SXin LI
1010*18fd37a7SXin LI for (f = 0; f < 2; f++)
1011*18fd37a7SXin LI {
1012*18fd37a7SXin LI free (cmp->file[f].equivs);
1013*18fd37a7SXin LI free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1014*18fd37a7SXin LI }
1015*18fd37a7SXin LI
1016*18fd37a7SXin LI for (e = script; e; e = p)
1017*18fd37a7SXin LI {
1018*18fd37a7SXin LI p = e->link;
1019*18fd37a7SXin LI free (e);
1020*18fd37a7SXin LI }
1021*18fd37a7SXin LI
1022*18fd37a7SXin LI if (! ROBUST_OUTPUT_STYLE (output_style))
1023*18fd37a7SXin LI for (f = 0; f < 2; ++f)
1024*18fd37a7SXin LI if (cmp->file[f].missing_newline)
1025*18fd37a7SXin LI {
1026*18fd37a7SXin LI error (0, 0, "%s: %s\n",
1027*18fd37a7SXin LI file_label[f] ? file_label[f] : cmp->file[f].name,
1028*18fd37a7SXin LI _("No newline at end of file"));
1029*18fd37a7SXin LI changes = 2;
1030*18fd37a7SXin LI }
1031*18fd37a7SXin LI }
1032*18fd37a7SXin LI
1033*18fd37a7SXin LI if (cmp->file[0].buffer != cmp->file[1].buffer)
1034*18fd37a7SXin LI free (cmp->file[0].buffer);
1035*18fd37a7SXin LI free (cmp->file[1].buffer);
1036*18fd37a7SXin LI
1037*18fd37a7SXin LI return changes;
1038*18fd37a7SXin LI }
1039